Treat hyphenated words strong in explicit analyzer (#46974)
peterbe authored Jan 12, 2024
1 parent c53df7a commit 971ca2f
Showing 7 changed files with 69 additions and 20 deletions.
11 changes: 8 additions & 3 deletions src/search/components/SearchResults.tsx
@@ -72,6 +72,13 @@ function SearchResultHit({
const title =
hit.highlights.title && hit.highlights.title.length > 0 ? hit.highlights.title[0] : hit.title

let content = ''
if (hit.highlights.content_explicit?.length) {
content = hit.highlights.content_explicit[0]
} else if (hit.highlights.content?.length) {
content = hit.highlights.content[0]
}

return (
<div className={cx('my-6', styles.search_result)} data-testid="search-result">
<p className="text-normal f5 color-fg-muted" style={{ wordSpacing: 2 }}>
@@ -99,9 +106,7 @@ function SearchResultHit({
}}
></Link>
</h2>
-{hit.highlights.content && hit.highlights.content.length > 0 && (
-<div dangerouslySetInnerHTML={{ __html: hit.highlights.content[0] }}></div>
-)}
+{content && <div dangerouslySetInnerHTML={{ __html: content }}></div>}
{debug && (
<Text as="p" fontWeight="bold">
score: <code style={{ marginRight: 10 }}>{hit.score}</code> popularity:{' '}
1 change: 1 addition & 0 deletions src/search/components/types.ts
@@ -6,6 +6,7 @@ export type SearchResultHitT = {
highlights: {
title?: string[]
content?: string[]
content_explicit?: string[]
}
score?: number
popularity?: number
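Taken together with the component change above, the new optional `content_explicit` field boils down to a simple fallback: prefer the highlight produced by the explicit analyzer when one exists, otherwise fall back to the regular `content` highlight. A minimal sketch of that selection logic (the helper name is illustrative, not part of the change):

```ts
// Hypothetical helper mirroring the fallback in SearchResultHit.
// The `highlights` shape matches the SearchResultHitT type above.
type Highlights = {
  title?: string[]
  content?: string[]
  content_explicit?: string[]
}

function pickContentHighlight(highlights: Highlights): string {
  if (highlights.content_explicit?.length) {
    return highlights.content_explicit[0]
  }
  if (highlights.content?.length) {
    return highlights.content[0]
  }
  return ''
}

// pickContentHighlight({ content: ['<mark>foo</mark> bar'] })
//   -> '<mark>foo</mark> bar'
// pickContentHighlight({ content_explicit: ['<mark>opt-in</mark> …'], content: ['…'] })
//   -> '<mark>opt-in</mark> …'
```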
39 changes: 32 additions & 7 deletions src/search/middleware/es-search.js
@@ -1,6 +1,9 @@
import { Client } from '@elastic/elasticsearch'

-export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings']
+export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content']
// This needs to match what we *use* in the `<SearchResults>` component.
// For example, if we don't display "headings" we shouldn't request
// highlights for it either.
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']

const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL
@@ -87,7 +90,11 @@ export async function getSearchResults({
matchQuery.bool.filter = topicsFilter
}

-const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS
+const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
// This acts as a convenience alias
if (highlightFields.includes('content')) {
highlightFields.push('content_explicit')
}
const highlight = getHighlightConfiguration(query, highlightFields)

const searchQuery = {
@@ -182,7 +189,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
const BOOST_HEADINGS = 3.0
const BOOST_CONTENT = 1.0
const BOOST_AND = 2.5
-const BOOST_EXPLICIT = 3.5
+const BOOST_EXPLICIT = 6.5
// Number doesn't matter so much but just make sure it's
// boosted low. Because we only really want this to come into
// play if nothing else matches. E.g. a search for `AcIons`
@@ -419,17 +426,14 @@ function getHighlightConfiguration(query, highlights) {
number_of_fragments: 1,
}
}
-if (highlights.includes('headings')) {
-fields.headings = { fragment_size: 150, number_of_fragments: 2 }
-}
if (highlights.includes('content')) {
// The 'no_match_size' is so we can display *something* for the
// preview if there was no highlight match at all within the content.
fields.content = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
-type: 'fvh', //
+type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 150,
@@ -442,6 +446,27 @@
},
},
}
// NOTE (JAN 2024) THIS IS DELIBERATELY COMMENTED OUT. FOR NOW...
// Once we know the indexes have all been rebuilt with the
// new `term_vector: 'with_positions_offsets',` everywhere we
// can un-comment this.
// fields.content_explicit = {
// // Fast Vector Highlighter
// // Using this requires that you first index these fields
// // with {term_vector: 'with_positions_offsets'}
// type: 'fvh',
// fragment_size: 150,
// number_of_fragments: 1,
// no_match_size: 0,

// highlight_query: {
// match_phrase_prefix: {
// content_explicit: {
// query,
// },
// },
// },
// }
}

return {
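Two details of the `getSearchResults` change are easy to miss: the requested highlight list is copied with `Array.from` so `DEFAULT_HIGHLIGHT_FIELDS` is never mutated, and asking for `content` highlights implicitly also asks for `content_explicit`. A small sketch of that expansion (the standalone function is illustrative, not part of the change):

```ts
// Illustrative version of the alias expansion inside getSearchResults.
const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']

function expandHighlightFields(requested?: string[]): string[] {
  // Copy first so we never push onto DEFAULT_HIGHLIGHT_FIELDS itself.
  const fields = Array.from(requested ?? DEFAULT_HIGHLIGHT_FIELDS)
  // Asking for 'content' acts as an alias that also covers 'content_explicit'.
  if (fields.includes('content')) {
    fields.push('content_explicit')
  }
  return fields
}

// expandHighlightFields()          -> ['title', 'content', 'content_explicit']
// expandHighlightFields(['title']) -> ['title']
```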
27 changes: 24 additions & 3 deletions src/search/scripts/index-elasticsearch.js
@@ -294,6 +294,19 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// CREATE INDEX
const settings = {
analysis: {
char_filter: {
// This will turn `runs-on` into `runs_on` so that it can't be
// tokenized to `runs` because `on` is a stop word.
// It also means that prose terms, in English, like `opt-in`
// will not be matched if someone searches for `opt in`. But this
// is why we have multiple different analyzers. So it becomes
// `opt_in` in the `text_analyzer_explicit` analyzer, but is
// left as `opt` in the `text_analyzer` analyzer.
hyphenation_filter: {
type: 'mapping',
mappings: ['- => _'],
},
},
analyzer: {
// We define two analyzers. Both are based on a "common core" with the
// `standard` tokenizer, but the second one adds the Snowball filter.
@@ -306,6 +319,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// A great use-case of this is when users search for keywords that are
// code words like `dependency-name`.
text_analyzer_explicit: {
char_filter: ['hyphenation_filter'],
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
@@ -355,7 +369,13 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// the searches faster.
term_vector: 'with_positions_offsets',
},
-content_explicit: { type: 'text', analyzer: 'text_analyzer_explicit' },
+content_explicit: {
+type: 'text',
+analyzer: 'text_analyzer_explicit',
+// This is used for fast highlighting. Uses more space but makes
+// the searches faster.
+term_vector: 'with_positions_offsets',
+},
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
breadcrumbs: { type: 'text' },
@@ -371,15 +391,16 @@ async function indexVersion(client, indexName, version, language, sourceDirector
const operations = allRecords.flatMap((doc) => {
const { title, objectID, content, breadcrumbs, headings, intro } = doc
const contentEscaped = escapeHTML(content)
const headingsEscaped = escapeHTML(headings)
const record = {
url: objectID,
title,
title_explicit: title,
content: contentEscaped,
content_explicit: contentEscaped,
breadcrumbs,
-headings,
-headings_explicit: headings,
+headings: headingsEscaped,
+headings_explicit: headingsEscaped,
// This makes sure the popularities are always greater than 1.
// Generally the 'popularity' is a ratio where the most popular
// one of all is 1.0.
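To see what the new `hyphenation_filter` actually does, the Elasticsearch `_analyze` API can run both analyzers against the same input once an index exists with the settings above. A rough sketch, assuming an `@elastic/elasticsearch` v8 client and an existing index name; the exact tokens depend on the rest of the analyzer chain (e.g. stemming):

```ts
import { Client } from '@elastic/elasticsearch'

const client = new Client({
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200',
})

async function compareAnalyzers(indexName: string, text = 'runs-on') {
  // Explicit analyzer: the `- => _` mapping runs before tokenization,
  // so `runs-on` survives as the single token `runs_on`.
  const explicit = await client.indices.analyze({
    index: indexName,
    analyzer: 'text_analyzer_explicit',
    text,
  })
  console.log(explicit.tokens?.map((t) => t.token)) // expected: ['runs_on']

  // Regular analyzer: the hyphen splits the term, `on` is dropped as a
  // stop word, and the remaining token may be stemmed.
  const regular = await client.indices.analyze({
    index: indexName,
    analyzer: 'text_analyzer',
    text,
  })
  console.log(regular.tokens?.map((t) => t.token)) // e.g. ['runs'] or ['run']
}
```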
2 changes: 1 addition & 1 deletion src/search/scripts/parse-page-sections-into-records.js
@@ -57,7 +57,7 @@ export default function parsePageSectionsIntoRecords(page) {
const headings = $sections
.map((i, el) => $(el).text())
.get()
-.join(' ')
+.join('\n')
.trim()

const intro = $('[data-search=lead] p').text().trim()
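The only change here is the separator used when the section headings are flattened into the `headings` field: a newline instead of a space, so each heading stays on its own line in the indexed record (the updated test fixture further down reflects this). For example:

```ts
const headingTexts = ['First heading', 'Second heading', 'Table heading']

// Before: heading boundaries disappear into ordinary word gaps.
headingTexts.join(' ') // 'First heading Second heading Table heading'

// After: each heading keeps its own line in the stored field.
headingTexts.join('\n') // 'First heading\nSecond heading\nTable heading'
```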
7 changes: 2 additions & 5 deletions src/search/tests/api-search.js
@@ -128,15 +128,13 @@ describeIfElasticsearchURL('search v1 middleware', () => {
test('configurable highlights', async () => {
const sp = new URLSearchParams()
sp.set('query', 'introduction heading')
-sp.append('highlights', 'headings')
sp.append('highlights', 'content')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
const results = JSON.parse(res.body)
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
expect(hit.highlights.title).toBeFalsy()
-expect(hit.highlights.headings).toBeTruthy()
expect(hit.highlights.content).toBeTruthy()
}
})
@@ -145,14 +143,13 @@ describeIfElasticsearchURL('search v1 middleware', () => {
const sp = new URLSearchParams()
// This will match because it's in the 'content' but not in 'headings'
sp.set('query', 'Fact of life')
-sp.set('highlights', 'headings')
+sp.set('highlights', 'title')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
const results = JSON.parse(res.body)
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
-expect(hit.highlights.headings).toBeTruthy()
-expect(hit.highlights.title).toBeFalsy()
+expect(hit.highlights.title).toBeTruthy()
expect(hit.highlights.content).toBeFalsy()
}
})
2 changes: 1 addition & 1 deletion src/search/tests/parse-page-sections-into-records.js
@@ -41,7 +41,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
objectID: '/example/href',
breadcrumbs: 'GitHub Actions / actions learning path',
title: 'I am the page title',
-headings: 'First heading Second heading Table heading',
+headings: 'First heading\nSecond heading\nTable heading',
content:
'This is an introduction to the article.\n' +
"In this article\nThis won't be ignored.\nFirst heading\n" +