Created
August 19, 2020 10:52
-
-
Save boutros/296b48836e4239e919a8f7a419507cd8 to your computer and use it in GitHub Desktop.
sibyl phrase suggester (did you mean)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/sibyl/assets/esconf/publication_mapping.json b/sibyl/assets/esconf/publication_mapping.json | |
index a51436a72..916d90d21 100644 | |
--- a/sibyl/assets/esconf/publication_mapping.json | |
+++ b/sibyl/assets/esconf/publication_mapping.json | |
@@ -7,7 +7,17 @@ | |
"type": "keyword" | |
}, | |
"suggest": { | |
- "type": "completion" | |
+ "type": "text", | |
+ "fields": { | |
+ "trigram": { | |
+ "type": "text", | |
+ "analyzer": "trigram" | |
+ }, | |
+ "reverse": { | |
+ "type": "text", | |
+ "analyzer": "reverse" | |
+ } | |
+ } | |
}, | |
"mainTitle": { | |
"type": "text", | |
diff --git a/sibyl/assets/esconf/publication_settings.json b/sibyl/assets/esconf/publication_settings.json | |
index 2ec7f2c3d..d0f977bdf 100644 | |
--- a/sibyl/assets/esconf/publication_settings.json | |
+++ b/sibyl/assets/esconf/publication_settings.json | |
@@ -21,9 +21,24 @@ | |
"norwegian_folding": { | |
"type": "icu_folding", | |
"unicode_set_filter": "[^æøåÆØÅ]" | |
+ }, | |
+ "shingle": { | |
+ "type": "shingle", | |
+ "min_shingle_size": 2, | |
+ "max_shingle_size": 3 | |
} | |
}, | |
"analyzer": { | |
+ "trigram": { | |
+ "type": "custom", | |
+ "tokenizer": "standard", | |
+ "filter": ["lowercase","shingle"] | |
+ }, | |
+ "reverse": { | |
+ "type": "custom", | |
+ "tokenizer": "standard", | |
+ "filter": ["lowercase","reverse"] | |
+ }, | |
"default": { | |
"char_filter": [ | |
"character_mappings" | |
diff --git a/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go b/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go | |
index 677ad8ace..320445cc7 100644 | |
--- a/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go | |
+++ b/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go | |
@@ -13,14 +13,14 @@ import ( | |
) | |
type extPublicationTemplate struct { | |
- Uri string `json:"uri"` | |
- RecordID string `json:"recordId"` | |
- Suggest []publication.Suggest `json:"suggest"` | |
- MediaType string `json:"mediaType"` | |
- MainTitle string `json:"mainTitle"` | |
- IDs common.FlexArray `json:"ids"` | |
- Created string `json:"created"` | |
- AgeLimit int `json:"ageLimit"` | |
+ Uri string `json:"uri"` | |
+ RecordID string `json:"recordId"` | |
+ Suggest []string `json:"suggest"` | |
+ MediaType string `json:"mediaType"` | |
+ MainTitle string `json:"mainTitle"` | |
+ IDs common.FlexArray `json:"ids"` | |
+ Created string `json:"created"` | |
+ AgeLimit int `json:"ageLimit"` | |
FormatAdaptations common.FlexArray `json:"formatAdaptations"` | |
Formats common.FlexArray `json:"formats"` | |
@@ -354,21 +354,25 @@ func (ep *extPublicationTemplate) Apply() (publication.IndexedPublication, error | |
if model.Publication.HomeBranches != nil { | |
// TODO: decide which fields to include in suggest. | |
- // Currently: publication.mainTitle and work.Authors | |
+ // Currently: publication.mainTitle and work.Agents | |
- // TODO: decide on how to rank the suggestions | |
- // For now, we weight the suggestion to the number of items on the publications, | |
- // which seems to work out fairly good and "fair". | |
- model.Publication.Suggest = append(model.Publication.Suggest, publication.Suggest{ | |
- Input: ep.MainTitle, | |
- Weight: int(ep.NumItems), | |
- }) | |
- for _, author := range []string(ep.WorkAuthors) { | |
- model.Publication.Suggest = append(model.Publication.Suggest, publication.Suggest{ | |
- Input: author, | |
- Weight: int(ep.NumItems), | |
- }) | |
- } | |
+ model.Publication.Suggest = append(model.Publication.Suggest, ep.MainTitle) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Actors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Adaptors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Authors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Contributors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Coreographers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Directors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Editors)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Featuring)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Illustrators)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Performers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Photographers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Producers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.ProductionCompanies)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Publishers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Readers)...) | |
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Translators)...) | |
} | |
diff --git a/sibyl/internal/search/autocomplete/autocomplete.go b/sibyl/internal/search/autocomplete/autocomplete.go | |
index fb5bfa310..b687736be 100644 | |
--- a/sibyl/internal/search/autocomplete/autocomplete.go | |
+++ b/sibyl/internal/search/autocomplete/autocomplete.go | |
@@ -21,10 +21,10 @@ func NewClient(esUrl *url.URL) (Client, error) { | |
client, err := elastic.NewClient( | |
elastic.SetURL(esUrl.String()), | |
elastic.SetSniff(false), | |
- elastic.SetGzip(true), | |
+ //elastic.SetGzip(true), | |
elastic.SetHealthcheck(false), | |
elastic.SetErrorLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})), | |
- //elastic.SetTraceLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})), | |
+ elastic.SetTraceLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})), | |
elastic.SetInfoLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"}))) | |
if err != nil { | |
return Client{}, err | |
@@ -32,16 +32,36 @@ func NewClient(esUrl *url.URL) (Client, error) { | |
return Client{client}, nil | |
} | |
-func (c Client) Search(q string, size int) ([]string, error) { | |
+func (c Client) Search(q string, size int, confidence float64, highlight bool) ([]string, error) { | |
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) | |
defer cancel() | |
- query := elastic.NewCompletionSuggester("suggest"). | |
+ g1 := elastic.NewDirectCandidateGenerator("suggest.trigram"). | |
+ SuggestMode("always"). | |
+ MinWordLength(2) | |
+ | |
+ g2 := elastic.NewDirectCandidateGenerator("suggest.reverse"). | |
+ SuggestMode("always"). | |
+ MinWordLength(2). | |
+ PreFilter("reverse"). | |
+ PostFilter("reverse") | |
+ | |
+ query := elastic.NewPhraseSuggester("suggest"). | |
Size(size). | |
Text(q). | |
- Field("suggest"). | |
- //FuzzyOptions(elastic.NewFuzzyCompletionSuggesterOptions().EditDistance(2)). | |
- SkipDuplicates(true) | |
+ Confidence(confidence). | |
+ GramSize(1). | |
+ Field("suggest.trigram"). | |
+ CandidateGenerators(g1, g2). | |
+ CollateQuery( | |
+ elastic.NewScriptInline(`{"match":{"titleAll" : "{{suggestion}}"}}`), | |
+ ). | |
+ CollateParams(map[string]interface{}{"field_name": "suggest"}). | |
+ CollatePrune(true) | |
+ | |
+ if highlight { | |
+ query = query.Highlight("<em>", "</em>") | |
+ } | |
// TODO: we actually need nothing from the source field, but it seems we have to ask for one field, | |
// otherwise we get all fields | |
@@ -66,7 +86,11 @@ func (c Client) Search(q string, size int) ([]string, error) { | |
} | |
suggestion := suggestions[0] | |
for _, opt := range suggestion.Options { | |
- hits = append(hits, opt.Text) | |
+ if highlight { | |
+ hits = append(hits, opt.Highlighted) | |
+ } else { | |
+ hits = append(hits, opt.Text) | |
+ } | |
} | |
return hits, nil | |
diff --git a/sibyl/internal/search/publication/publication.go b/sibyl/internal/search/publication/publication.go | |
index 0a454c66c..6af6cbdf3 100644 | |
--- a/sibyl/internal/search/publication/publication.go | |
+++ b/sibyl/internal/search/publication/publication.go | |
@@ -47,16 +47,11 @@ type OtherPublications struct { | |
Languages common.FlexArray `json:"languages,omitempty"` | |
} | |
-type Suggest struct { | |
- Input string `json:"input"` | |
- Weight int `json:"weight"` | |
-} | |
- | |
type Publication struct { | |
// Obligatoriske felt: | |
ID string `json:"id"` | |
URI string `json:"uri"` | |
- Suggest []Suggest `json:"suggest"` | |
+ Suggest []string `json:"suggest"` | |
RecordID string `json:"recordId"` | |
MediaType string `json:"mediaType"` | |
MainTitle string `json:"mainTitle"` | |
diff --git a/sibyl/internal/search/search.go b/sibyl/internal/search/search.go | |
index ce8dbcf74..7e4bc5835 100644 | |
--- a/sibyl/internal/search/search.go | |
+++ b/sibyl/internal/search/search.go | |
@@ -82,10 +82,22 @@ func NewSearch(elasticUrl *url.URL) (*Search, error) { | |
func (s *Search) autocomplete(w http.ResponseWriter, r *http.Request) { | |
callId := r.Context().Value(config.DeichmanCallIdKey) | |
+ // extract query params: q, size, confidence, highlight | |
query := r.URL.Query().Get("q") | |
query = strings.ReplaceAll(query, `"`, `\"`) | |
size := extractSizeParam(r.URL.Query().Get("size")) | |
- hits, err := s.autocompleteClient.Search(query, size) | |
+ confidence := 0.5 // default | |
+ if confidenceParam := r.URL.Query().Get("confidence"); confidenceParam != "" { | |
+ if c, err := strconv.ParseFloat(confidenceParam, 64); err == nil { | |
+ confidence = c | |
+ } | |
+ } | |
+ highlight := false // default | |
+ if r.URL.Query().Get("highlight") != "" { | |
+ highlight = true | |
+ } | |
+ | |
+ hits, err := s.autocompleteClient.Search(query, size, confidence, highlight) | |
if err != nil { | |
logger.WithFields(log.Fields{ | |
"call_id": callId, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment