Patch in exact search for meilisearch (#29671)
Meilisearch does not currently offer a per-query option to control fuzziness:

- https://github.com/meilisearch/meilisearch/issues/1192
- https://github.com/orgs/meilisearch/discussions/377
- https://github.com/meilisearch/meilisearch/discussions/1096

Until this is addressed upstream, we work around it by post-filtering the search results in Gitea. For future work, a backend-only option was also added so that fuzziness can be enabled for the issue indexer, and the code was refactored so the fuzzy option follows the same logic as the code indexer.

---
*Sponsored by Kithara Software GmbH*

Conflicts:
	routers/web/repo/search.go
	trivial context conflict s/isMatch/isFuzzy/
parent 299c2a1408
commit 38c3cc4eb7
14 changed files with 184 additions and 33 deletions
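The heart of the change is a post-filter over the meilisearch hits: when fuzzy matching is off, only hits that literally contain the keyword in the title, the body, or at least one comment are kept. A simplified, standalone sketch of that idea (hypothetical types and names; the committed `nonFuzzyWorkaround` further down operates on meilisearch's raw `map[string]any` hits and also reports malformed responses):

```go
package main

import (
	"fmt"
	"strings"
)

// issueHit is a simplified stand-in for a meilisearch document as the issue
// indexer stores it (id, title, content, comments).
type issueHit struct {
	ID       int64
	Title    string
	Content  string
	Comments []string
}

// filterExact drops hits that do not literally contain the keyword when fuzzy
// matching is disabled; with fuzzy matching enabled, meilisearch's own result
// set is returned unchanged.
func filterExact(hits []issueHit, keyword string, isFuzzy bool) []issueHit {
	if isFuzzy {
		return hits
	}
	keyword = strings.ToLower(keyword)
	kept := make([]issueHit, 0, len(hits))
	for _, h := range hits {
		haystacks := append([]string{h.Title, h.Content}, h.Comments...)
		for _, s := range haystacks {
			if strings.Contains(strings.ToLower(s), keyword) {
				kept = append(kept, h)
				break
			}
		}
	}
	return kept
}

func main() {
	hits := []issueHit{
		{ID: 11, Title: "a title", Comments: []string{"I'm currently bowling"}},
		{ID: 33, Title: "Bowl-ing as fuzzy match"},
	}
	fmt.Println(filterExact(hits, "bowling", false)) // keeps only ID 11
	fmt.Println(filterExact(hits, "bowling", true))  // keeps both hits
}
```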
@@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {
 
 // Search searches for files in the specified repo.
 // Returns the matching file-paths
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
 	var (
 		indexerQuery query.Query
 		keywordQuery query.Query
 	)
 
-	if isMatch {
-		prefixQuery := bleve.NewPrefixQuery(keyword)
-		prefixQuery.FieldVal = "Content"
-		keywordQuery = prefixQuery
-	} else {
+	if isFuzzy {
 		phraseQuery := bleve.NewMatchPhraseQuery(keyword)
 		phraseQuery.FieldVal = "Content"
 		phraseQuery.Analyzer = repoIndexerAnalyzer
 		keywordQuery = phraseQuery
+	} else {
+		prefixQuery := bleve.NewPrefixQuery(keyword)
+		prefixQuery.FieldVal = "Content"
+		keywordQuery = prefixQuery
 	}
 
 	if len(repoIDs) > 0 {
@@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
 }
 
 // Search searches for codes and language stats by given conditions.
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
-	searchType := esMultiMatchTypeBestFields
-	if isMatch {
-		searchType = esMultiMatchTypePhrasePrefix
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+	searchType := esMultiMatchTypePhrasePrefix
+	if isFuzzy {
+		searchType = esMultiMatchTypeBestFields
 	}
 
 	kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
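For background on the two multi-match types toggled here: `best_fields` matches the query terms independently and scores a document by its best-matching field, while `phrase_prefix` requires the terms to appear as a phrase with the last term treated as a prefix, which is what the exact (non-fuzzy) search wants. A minimal sketch of the toggle, using string literals in place of the `esMultiMatchType*` constants defined in this file:

```go
package sketch // hypothetical helper, not part of the commit

// multiMatchType mirrors the logic above: exact search maps to
// "phrase_prefix", fuzzy search maps to "best_fields".
func multiMatchType(isFuzzy bool) string {
	if isFuzzy {
		return "best_fields"
	}
	return "phrase_prefix"
}
```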
@@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
 
 	for _, kw := range keywords {
 		t.Run(kw.Keyword, func(t *testing.T) {
-			total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
+			total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
 			assert.NoError(t, err)
 			assert.Len(t, kw.IDs, int(total))
 			assert.Len(t, langs, kw.Langs)
@@ -16,7 +16,7 @@ type Indexer interface {
 	internal.Indexer
 	Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
 	Delete(ctx context.Context, repoID int64) error
-	Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
+	Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
 }
 
 // NewDummyIndexer returns a dummy indexer
@@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
 	return fmt.Errorf("indexer is not ready")
 }
 
-func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
+func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
 	return 0, nil, nil, fmt.Errorf("indexer is not ready")
 }
@@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) {
 }
 
 // PerformSearch perform a search on a repository
-func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
+// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
+func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
 	if len(keyword) == 0 {
 		return 0, nil, nil, nil
 	}
 
-	total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
+	total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
 	if err != nil {
 		return 0, nil, nil, err
 	}
@@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQuery {
 	return q
 }
 
+// PrefixQuery generates a match prefix query for the given prefix and field
+func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
+	q := bleve.NewPrefixQuery(matchPrefix)
+	q.FieldVal = field
+	return q
+}
+
 // BoolFieldQuery generates a bool field query for the given value and field
 func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
 	q := bleve.NewBoolFieldQuery(value)
@@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
 	var queries []query.Query
 
 	if options.Keyword != "" {
-		keywordQueries := []query.Query{
-			inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
-			inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
-			inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+		if options.IsFuzzyKeyword {
+			queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+				inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
+				inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
+				inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+			}...))
+		} else {
+			queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+				inner_bleve.PrefixQuery(options.Keyword, "title"),
+				inner_bleve.PrefixQuery(options.Keyword, "content"),
+				inner_bleve.PrefixQuery(options.Keyword, "comments"),
+			}...))
 		}
-		queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
 	}
 
 	if len(options.RepoIDs) > 0 || options.AllPublic {
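The disjunction above means "the keyword must match in title OR content OR comments". A compilable sketch of the non-fuzzy branch built directly against bleve (hypothetical function name; same query shape as the hunk, with prefix queries matching indexed terms that start with the keyword):

```go
package sketch // hypothetical, not part of the commit

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// exactIssueQuery builds the same OR-of-prefix-queries shape as the hunk above.
func exactIssueQuery(keyword string) query.Query {
	prefix := func(field string) query.Query {
		q := bleve.NewPrefixQuery(keyword)
		q.FieldVal = field // restrict the prefix match to one field
		return q
	}
	return bleve.NewDisjunctionQuery(prefix("title"), prefix("content"), prefix("comments"))
}
```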
@@ -19,6 +19,10 @@ import (
 
 const (
 	issueIndexerLatestVersion = 1
+	// multi-match-types, currently only 2 types are used
+	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
+	esMultiMatchTypeBestFields   = "best_fields"
+	esMultiMatchTypePhrasePrefix = "phrase_prefix"
 )
 
 var _ internal.Indexer = &Indexer{}
@@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
 	query := elastic.NewBoolQuery()
 
 	if options.Keyword != "" {
-		query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
+
+		searchType := esMultiMatchTypePhrasePrefix
+		if options.IsFuzzyKeyword {
+			searchType = esMultiMatchTypeBestFields
+		}
+
+		query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
 	}
 
 	if len(options.RepoIDs) > 0 {
@@ -74,6 +74,8 @@ type SearchResult struct {
 type SearchOptions struct {
 	Keyword string // keyword to search
 
+	IsFuzzyKeyword bool // if false the levenshtein distance is 0
+
 	RepoIDs   []int64 // repository IDs which the issues belong to
 	AllPublic bool    // if include all public repositories
 
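Since the commit message notes that the fuzzy option for issues is backend-only for now, a hedged sketch of what a caller would set (the struct below merely mirrors the relevant fields of `internal.SearchOptions`, it is not the real type):

```go
package sketch // hypothetical mirror of the relevant fields, not the real type

type issueSearchOptions struct {
	Keyword        string
	IsFuzzyKeyword bool // false keeps the Levenshtein distance at 0, i.e. exact matching
	RepoIDs        []int64
}

// A backend caller opting into fuzzy issue search; the issue web UI does not set this yet.
var exampleFuzzySearch = issueSearchOptions{
	Keyword:        "crash",
	IsFuzzyKeyword: true,
	RepoIDs:        []int64{1},
}
```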
@@ -5,6 +5,7 @@ package meilisearch
 
 import (
 	"context"
+	"errors"
 	"strconv"
 	"strings"
 
@@ -16,12 +17,15 @@ import (
 )
 
 const (
-	issueIndexerLatestVersion = 2
+	issueIndexerLatestVersion = 3
 
 	// TODO: make this configurable if necessary
 	maxTotalHits = 10000
 )
 
+// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
+var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
+
 var _ internal.Indexer = &Indexer{}
 
 // Indexer implements Indexer interface
@@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
 		},
+		DisplayedAttributes: []string{
+			"id",
+			"title",
+			"content",
+			"comments",
+		},
 		FilterableAttributes: []string{
 			"repo_id",
@@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
 		return nil, err
 	}
 
-	hits := make([]internal.Match, 0, len(searchRes.Hits))
-	for _, hit := range searchRes.Hits {
-		hits = append(hits, internal.Match{
-			ID: int64(hit.(map[string]any)["id"].(float64)),
-		})
+	hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
+	if err != nil {
+		return nil, err
 	}
 
 	return &internal.SearchResult{
@@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
 	}
 	return field + ":asc"
 }
+
+// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
+// and you can only change "typo tolerance" per index. So we have to post-filter the results
+// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
+// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
+func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
+	hits := make([]internal.Match, 0, len(searchRes.Hits))
+	for _, hit := range searchRes.Hits {
+		hit, ok := hit.(map[string]any)
+		if !ok {
+			return nil, ErrMalformedResponse
+		}
+
+		if !isFuzzy {
+			keyword = strings.ToLower(keyword)
+
+			// declare a anon func to check if the title, content or at least one comment contains the keyword
+			found, err := func() (bool, error) {
+				// check if title match first
+				title, ok := hit["title"].(string)
+				if !ok {
+					return false, ErrMalformedResponse
+				} else if strings.Contains(strings.ToLower(title), keyword) {
+					return true, nil
+				}
+
+				// check if content has a match
+				content, ok := hit["content"].(string)
+				if !ok {
+					return false, ErrMalformedResponse
+				} else if strings.Contains(strings.ToLower(content), keyword) {
+					return true, nil
+				}
+
+				// now check for each comment if one has a match
+				// so we first try to cast and skip if there are no comments
+				comments, ok := hit["comments"].([]any)
+				if !ok {
+					return false, ErrMalformedResponse
+				} else if len(comments) == 0 {
+					return false, nil
+				}
+
+				// now we iterate over all and report as soon as we detect one match
+				for i := range comments {
+					comment, ok := comments[i].(string)
+					if !ok {
+						return false, ErrMalformedResponse
+					}
+					if strings.Contains(strings.ToLower(comment), keyword) {
+						return true, nil
+					}
+				}
+
+				// we got no match
+				return false, nil
+			}()
+
+			if err != nil {
+				return nil, err
+			} else if !found {
+				continue
+			}
+		}
+		issueID, ok := hit["id"].(float64)
+		if !ok {
+			return nil, ErrMalformedResponse
+		}
+		hits = append(hits, internal.Match{
+			ID: int64(issueID),
+		})
+	}
+	return hits, nil
+}
@@ -10,7 +10,11 @@ import (
 	"testing"
 	"time"
 
+	"code.gitea.io/gitea/modules/indexer/issues/internal"
 	"code.gitea.io/gitea/modules/indexer/issues/internal/tests"
+
+	"github.com/meilisearch/meilisearch-go"
+	"github.com/stretchr/testify/assert"
 )
 
 func TestMeilisearchIndexer(t *testing.T) {
@@ -49,3 +53,44 @@ func TestMeilisearchIndexer(t *testing.T) {
 
 	tests.TestIndexer(t, indexer)
 }
+
+func TestNonFuzzyWorkaround(t *testing.T) {
+	// get unexpected return
+	_, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
+		Hits: []any{"aa", "bb", "cc", "dd"},
+	}, "bowling", false)
+	assert.ErrorIs(t, err, ErrMalformedResponse)
+
+	validResponse := &meilisearch.SearchResponse{
+		Hits: []any{
+			map[string]any{
+				"id":       float64(11),
+				"title":    "a title",
+				"content":  "issue body with no match",
+				"comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
+			},
+			map[string]any{
+				"id":       float64(22),
+				"title":    "Bowling as title",
+				"content":  "",
+				"comments": []any{},
+			},
+			map[string]any{
+				"id":       float64(33),
+				"title":    "Bowl-ing as fuzzy match",
+				"content":  "",
+				"comments": []any{},
+			},
+		},
+	}
+
+	// nonFuzzy
+	hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
+	assert.NoError(t, err)
+	assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)
+
+	// fuzzy
+	hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
+	assert.NoError(t, err)
+	assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
+}
@@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
 	keyword := ctx.FormTrim("q")
 
 	queryType := ctx.FormTrim("t")
-	isMatch := queryType == "match"
+	isFuzzy := queryType != "match"
 
 	ctx.Data["Keyword"] = keyword
 	ctx.Data["Language"] = language
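This and the two later router hunks make the same change: the existing `t` form value still selects the search mode, only the flag's name and sense are flipped from `isMatch` to `isFuzzy`. A tiny self-contained illustration of the mapping (hypothetical helper, not in the commit):

```go
package main

import "fmt"

// isFuzzyFromForm mirrors the router change above: "t=match" requests the
// exact (prefix / phrase-prefix) search, any other value or no value at all
// requests the fuzzy search.
func isFuzzyFromForm(queryType string) bool {
	return queryType != "match"
}

func main() {
	fmt.Println(isFuzzyFromForm("match")) // false: exact search
	fmt.Println(isFuzzyFromForm(""))      // true: fuzzy search
}
```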
@@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
 	)
 
 	if (len(repoIDs) > 0) || isAdmin {
-		total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+		total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
 		if err != nil {
 			if code_indexer.IsAvailable(ctx) {
 				ctx.ServerError("SearchResults", err)
@@ -21,7 +21,7 @@ func Search(ctx *context.Context) {
 	keyword := ctx.FormTrim("q")
 
 	queryType := ctx.FormTrim("t")
-	isMatch := queryType == "match"
+	isFuzzy := queryType != "match"
 
 	ctx.Data["Keyword"] = keyword
 	ctx.Data["Language"] = language
@@ -44,7 +44,7 @@ func Search(ctx *context.Context) {
 	ctx.Data["CodeIndexerEnabled"] = true
 
 	total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
-		language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+		language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
 	if err != nil {
 		if code_indexer.IsAvailable(ctx) {
 			ctx.ServerError("SearchResults", err)
@@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
 	keyword := ctx.FormTrim("q")
 
 	queryType := ctx.FormTrim("t")
-	isMatch := queryType == "match"
+	isFuzzy := queryType != "match"
 
 	ctx.Data["Keyword"] = keyword
 	ctx.Data["Language"] = language
@@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
 	)
 
 	if len(repoIDs) > 0 {
-		total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+		total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
 		if err != nil {
 			if code_indexer.IsAvailable(ctx) {
 				ctx.ServerError("SearchResults", err)