Skip to content

Instantly share code, notes, and snippets.

@boutros
Created August 19, 2020 10:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boutros/296b48836e4239e919a8f7a419507cd8 to your computer and use it in GitHub Desktop.
Save boutros/296b48836e4239e919a8f7a419507cd8 to your computer and use it in GitHub Desktop.
sibyl phrase suggester (did you mean)
diff --git a/sibyl/assets/esconf/publication_mapping.json b/sibyl/assets/esconf/publication_mapping.json
index a51436a72..916d90d21 100644
--- a/sibyl/assets/esconf/publication_mapping.json
+++ b/sibyl/assets/esconf/publication_mapping.json
@@ -7,7 +7,17 @@
"type": "keyword"
},
"suggest": {
- "type": "completion"
+ "type": "text",
+ "fields": {
+ "trigram": {
+ "type": "text",
+ "analyzer": "trigram"
+ },
+ "reverse": {
+ "type": "text",
+ "analyzer": "reverse"
+ }
+ }
},
"mainTitle": {
"type": "text",
diff --git a/sibyl/assets/esconf/publication_settings.json b/sibyl/assets/esconf/publication_settings.json
index 2ec7f2c3d..d0f977bdf 100644
--- a/sibyl/assets/esconf/publication_settings.json
+++ b/sibyl/assets/esconf/publication_settings.json
@@ -21,9 +21,24 @@
"norwegian_folding": {
"type": "icu_folding",
"unicode_set_filter": "[^æøåÆØÅ]"
+ },
+ "shingle": {
+ "type": "shingle",
+ "min_shingle_size": 2,
+ "max_shingle_size": 3
}
},
"analyzer": {
+ "trigram": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": ["lowercase","shingle"]
+ },
+ "reverse": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": ["lowercase","reverse"]
+ },
"default": {
"char_filter": [
"character_mappings"
diff --git a/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go b/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go
index 677ad8ace..320445cc7 100644
--- a/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go
+++ b/sibyl/internal/cache/rdf/pipeline/publication/extendedpublication.go
@@ -13,14 +13,14 @@ import (
)
type extPublicationTemplate struct {
- Uri string `json:"uri"`
- RecordID string `json:"recordId"`
- Suggest []publication.Suggest `json:"suggest"`
- MediaType string `json:"mediaType"`
- MainTitle string `json:"mainTitle"`
- IDs common.FlexArray `json:"ids"`
- Created string `json:"created"`
- AgeLimit int `json:"ageLimit"`
+ Uri string `json:"uri"`
+ RecordID string `json:"recordId"`
+ Suggest []string `json:"suggest"`
+ MediaType string `json:"mediaType"`
+ MainTitle string `json:"mainTitle"`
+ IDs common.FlexArray `json:"ids"`
+ Created string `json:"created"`
+ AgeLimit int `json:"ageLimit"`
FormatAdaptations common.FlexArray `json:"formatAdaptations"`
Formats common.FlexArray `json:"formats"`
@@ -354,21 +354,25 @@ func (ep *extPublicationTemplate) Apply() (publication.IndexedPublication, error
if model.Publication.HomeBranches != nil {
// TODO: decide which fields to include in suggest.
- // Currently: publication.mainTitle and work.Authors
+ // Currently: publication.mainTitle and work.Agents
- // TODO: decide on how to rank the suggestions
- // For now, we weight the suggestion to the number of items on the publications,
- // which seems to work out fairly good and "fair".
- model.Publication.Suggest = append(model.Publication.Suggest, publication.Suggest{
- Input: ep.MainTitle,
- Weight: int(ep.NumItems),
- })
- for _, author := range []string(ep.WorkAuthors) {
- model.Publication.Suggest = append(model.Publication.Suggest, publication.Suggest{
- Input: author,
- Weight: int(ep.NumItems),
- })
- }
+ model.Publication.Suggest = append(model.Publication.Suggest, ep.MainTitle)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Actors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Adaptors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Authors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Contributors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Coreographers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Directors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Editors)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Featuring)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Illustrators)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Performers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Photographers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Producers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.ProductionCompanies)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Publishers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Readers)...)
+ model.Publication.Suggest = append(model.Publication.Suggest, []string(model.Work.Agents.Translators)...)
}
diff --git a/sibyl/internal/search/autocomplete/autocomplete.go b/sibyl/internal/search/autocomplete/autocomplete.go
index fb5bfa310..b687736be 100644
--- a/sibyl/internal/search/autocomplete/autocomplete.go
+++ b/sibyl/internal/search/autocomplete/autocomplete.go
@@ -21,10 +21,10 @@ func NewClient(esUrl *url.URL) (Client, error) {
client, err := elastic.NewClient(
elastic.SetURL(esUrl.String()),
elastic.SetSniff(false),
- elastic.SetGzip(true),
+ //elastic.SetGzip(true),
elastic.SetHealthcheck(false),
elastic.SetErrorLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})),
- //elastic.SetTraceLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})),
+ elastic.SetTraceLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})),
elastic.SetInfoLog(searchAutocompleteLogger.WithFields(log.Fields{"logger_name": "ELASTIC"})))
if err != nil {
return Client{}, err
@@ -32,16 +32,36 @@ func NewClient(esUrl *url.URL) (Client, error) {
return Client{client}, nil
}
-func (c Client) Search(q string, size int) ([]string, error) {
+func (c Client) Search(q string, size int, confidence float64, highlight bool) ([]string, error) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
- query := elastic.NewCompletionSuggester("suggest").
+ g1 := elastic.NewDirectCandidateGenerator("suggest.trigram").
+ SuggestMode("always").
+ MinWordLength(2)
+
+ g2 := elastic.NewDirectCandidateGenerator("suggest.reverse").
+ SuggestMode("always").
+ MinWordLength(2).
+ PreFilter("reverse").
+ PostFilter("reverse")
+
+ query := elastic.NewPhraseSuggester("suggest").
Size(size).
Text(q).
- Field("suggest").
- //FuzzyOptions(elastic.NewFuzzyCompletionSuggesterOptions().EditDistance(2)).
- SkipDuplicates(true)
+ Confidence(confidence).
+ GramSize(1).
+ Field("suggest.trigram").
+ CandidateGenerators(g1, g2).
+ CollateQuery(
+ elastic.NewScriptInline(`{"match":{"titleAll" : "{{suggestion}}"}}`),
+ ).
+ CollateParams(map[string]interface{}{"field_name": "suggest"}).
+ CollatePrune(true)
+
+ if highlight {
+ query = query.Highlight("<em>", "</em>")
+ }
// TODO: we actually need nothing from the source field, but it seems we have to ask for one field,
// otherwise we get all fields
@@ -66,7 +86,11 @@ func (c Client) Search(q string, size int) ([]string, error) {
}
suggestion := suggestions[0]
for _, opt := range suggestion.Options {
- hits = append(hits, opt.Text)
+ if highlight {
+ hits = append(hits, opt.Highlighted)
+ } else {
+ hits = append(hits, opt.Text)
+ }
}
return hits, nil
diff --git a/sibyl/internal/search/publication/publication.go b/sibyl/internal/search/publication/publication.go
index 0a454c66c..6af6cbdf3 100644
--- a/sibyl/internal/search/publication/publication.go
+++ b/sibyl/internal/search/publication/publication.go
@@ -47,16 +47,11 @@ type OtherPublications struct {
Languages common.FlexArray `json:"languages,omitempty"`
}
-type Suggest struct {
- Input string `json:"input"`
- Weight int `json:"weight"`
-}
-
type Publication struct {
// Obligatoriske felt:
ID string `json:"id"`
URI string `json:"uri"`
- Suggest []Suggest `json:"suggest"`
+ Suggest []string `json:"suggest"`
RecordID string `json:"recordId"`
MediaType string `json:"mediaType"`
MainTitle string `json:"mainTitle"`
diff --git a/sibyl/internal/search/search.go b/sibyl/internal/search/search.go
index ce8dbcf74..7e4bc5835 100644
--- a/sibyl/internal/search/search.go
+++ b/sibyl/internal/search/search.go
@@ -82,10 +82,22 @@ func NewSearch(elasticUrl *url.URL) (*Search, error) {
func (s *Search) autocomplete(w http.ResponseWriter, r *http.Request) {
callId := r.Context().Value(config.DeichmanCallIdKey)
+ // extract query params: q, size, confidence, highlight
query := r.URL.Query().Get("q")
query = strings.ReplaceAll(query, `"`, `\"`)
size := extractSizeParam(r.URL.Query().Get("size"))
- hits, err := s.autocompleteClient.Search(query, size)
+ confidence := 0.5 // default
+ if confidenceParam := r.URL.Query().Get("confidence"); confidenceParam != "" {
+ if c, err := strconv.ParseFloat(confidenceParam, 64); err == nil {
+ confidence = c
+ }
+ }
+ highlight := false // default
+ if r.URL.Query().Get("highlight") != "" {
+ highlight = true
+ }
+
+ hits, err := s.autocompleteClient.Search(query, size, confidence, highlight)
if err != nil {
logger.WithFields(log.Fields{
"call_id": callId,
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment