diff --git a/src/search/components/SearchResults.tsx b/src/search/components/SearchResults.tsx
index 792783528d94..13fbf40cfc24 100644
--- a/src/search/components/SearchResults.tsx
+++ b/src/search/components/SearchResults.tsx
@@ -72,6 +72,13 @@ function SearchResultHit({
const title =
hit.highlights.title && hit.highlights.title.length > 0 ? hit.highlights.title[0] : hit.title
+ let content = ''
+ if (hit.highlights.content_explicit?.length) {
+ content = hit.highlights.content_explicit[0]
+ } else if (hit.highlights.content?.length) {
+ content = hit.highlights.content[0]
+ }
+
return (
@@ -99,9 +106,7 @@ function SearchResultHit({
}}
>
- {hit.highlights.content && hit.highlights.content.length > 0 && (
-
- )}
+ {content &&
}
{debug && (
score: {hit.score}
popularity:{' '}
diff --git a/src/search/components/types.ts b/src/search/components/types.ts
index f7ce0c5f458c..a7a35fba2975 100644
--- a/src/search/components/types.ts
+++ b/src/search/components/types.ts
@@ -6,6 +6,7 @@ export type SearchResultHitT = {
highlights: {
title?: string[]
content?: string[]
+ content_explicit?: string[]
}
score?: number
popularity?: number
diff --git a/src/search/middleware/es-search.js b/src/search/middleware/es-search.js
index d6e65b3a5443..6373de940585 100644
--- a/src/search/middleware/es-search.js
+++ b/src/search/middleware/es-search.js
@@ -1,6 +1,9 @@
import { Client } from '@elastic/elasticsearch'
-export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings']
+export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content']
+// This needs to match what we *use* in the `<SearchResults>` component.
+// For example, if we don't display "headings" we shouldn't request
+// highlights for it either.
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']
const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL
@@ -87,7 +90,11 @@ export async function getSearchResults({
matchQuery.bool.filter = topicsFilter
}
- const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS
+ const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
+ // This acts as a convenience alias
+ if (highlightFields.includes('content')) {
+ highlightFields.push('content_explicit')
+ }
const highlight = getHighlightConfiguration(query, highlightFields)
const searchQuery = {
@@ -182,7 +189,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
const BOOST_HEADINGS = 3.0
const BOOST_CONTENT = 1.0
const BOOST_AND = 2.5
- const BOOST_EXPLICIT = 3.5
+ const BOOST_EXPLICIT = 6.5
// Number doesn't matter so much but just make sure it's
// boosted low. Because we only really want this to come into
// play if nothing else matches. E.g. a search for `AcIons`
@@ -419,9 +426,6 @@ function getHighlightConfiguration(query, highlights) {
number_of_fragments: 1,
}
}
- if (highlights.includes('headings')) {
- fields.headings = { fragment_size: 150, number_of_fragments: 2 }
- }
if (highlights.includes('content')) {
// The 'no_match_size' is so we can display *something* for the
// preview if there was no highlight match at all within the content.
@@ -429,7 +433,7 @@ function getHighlightConfiguration(query, highlights) {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
- type: 'fvh', //
+ type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 150,
@@ -442,6 +446,27 @@ function getHighlightConfiguration(query, highlights) {
},
},
}
+ // NOTE (JAN 2024) THIS IS DELIBERATELY COMMENTED OUT. FOR NOW...
+ // Once we know the indexes have all been rebuilt with the
+ // new `term_vector: 'with_positions_offsets',` everywhere we
+ // can un-comment this.
+ // fields.content_explicit = {
+ // // Fast Vector Highlighter
+ // // Using this requires that you first index these fields
+ // // with {term_vector: 'with_positions_offsets'}
+ // type: 'fvh',
+ // fragment_size: 150,
+ // number_of_fragments: 1,
+ // no_match_size: 0,
+
+ // highlight_query: {
+ // match_phrase_prefix: {
+ // content_explicit: {
+ // query,
+ // },
+ // },
+ // },
+ // }
}
return {
diff --git a/src/search/scripts/index-elasticsearch.js b/src/search/scripts/index-elasticsearch.js
index 292f2a090bac..cbcc92523fed 100755
--- a/src/search/scripts/index-elasticsearch.js
+++ b/src/search/scripts/index-elasticsearch.js
@@ -294,6 +294,19 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// CREATE INDEX
const settings = {
analysis: {
+ char_filter: {
+ // This will turn `runs-on` into `runs_on` so that it can't be
+ // tokenized to `runs` because `on` is a stop word.
+ // It also means that prose terms, in English, like `opt-in`
+ // will not be matched if someone searches for `opt in`. But this
+ // is why we have multiple different analyzers. So it becomes
+ // `opt_in` in the `text_analyzer_explicit` analyzer, but is
+ // left as `opt` in the `text_analyzer` analyzer.
+ hyphenation_filter: {
+ type: 'mapping',
+ mappings: ['- => _'],
+ },
+ },
analyzer: {
// We defined to analyzers. Both based on a "common core" with the
// `standard` tokenizer. But the second one adds Snowball filter.
@@ -306,6 +319,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// A great use-case of this when users search for keywords that are
// code words like `dependency-name`.
text_analyzer_explicit: {
+ char_filter: ['hyphenation_filter'],
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
@@ -355,7 +369,13 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// the searches faster.
term_vector: 'with_positions_offsets',
},
- content_explicit: { type: 'text', analyzer: 'text_analyzer_explicit' },
+ content_explicit: {
+ type: 'text',
+ analyzer: 'text_analyzer_explicit',
+ // This is used for fast highlighting. Uses more space but makes
+ // the searches faster.
+ term_vector: 'with_positions_offsets',
+ },
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
breadcrumbs: { type: 'text' },
@@ -371,6 +391,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector
const operations = allRecords.flatMap((doc) => {
const { title, objectID, content, breadcrumbs, headings, intro } = doc
const contentEscaped = escapeHTML(content)
+ const headingsEscaped = escapeHTML(headings)
const record = {
url: objectID,
title,
@@ -378,8 +399,8 @@ async function indexVersion(client, indexName, version, language, sourceDirector
content: contentEscaped,
content_explicit: contentEscaped,
breadcrumbs,
- headings,
- headings_explicit: headings,
+ headings: headingsEscaped,
+ headings_explicit: headingsEscaped,
// This makes sure the popularities are always greater than 1.
// Generally the 'popularity' is a ratio where the most popular
// one of all is 1.0.
diff --git a/src/search/scripts/parse-page-sections-into-records.js b/src/search/scripts/parse-page-sections-into-records.js
index 0a3f3f1be606..a25b280a22b4 100644
--- a/src/search/scripts/parse-page-sections-into-records.js
+++ b/src/search/scripts/parse-page-sections-into-records.js
@@ -57,7 +57,7 @@ export default function parsePageSectionsIntoRecords(page) {
const headings = $sections
.map((i, el) => $(el).text())
.get()
- .join(' ')
+ .join('\n')
.trim()
const intro = $('[data-search=lead] p').text().trim()
diff --git a/src/search/tests/api-search.js b/src/search/tests/api-search.js
index 1771731ccfb5..1b6c35ec34db 100644
--- a/src/search/tests/api-search.js
+++ b/src/search/tests/api-search.js
@@ -128,7 +128,6 @@ describeIfElasticsearchURL('search v1 middleware', () => {
test('configurable highlights', async () => {
const sp = new URLSearchParams()
sp.set('query', 'introduction heading')
- sp.append('highlights', 'headings')
sp.append('highlights', 'content')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
@@ -136,7 +135,6 @@ describeIfElasticsearchURL('search v1 middleware', () => {
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
expect(hit.highlights.title).toBeFalsy()
- expect(hit.highlights.headings).toBeTruthy()
expect(hit.highlights.content).toBeTruthy()
}
})
@@ -145,14 +143,13 @@ describeIfElasticsearchURL('search v1 middleware', () => {
const sp = new URLSearchParams()
// This will match because it's in the 'content' but not in 'headings'
sp.set('query', 'Fact of life')
- sp.set('highlights', 'headings')
+ sp.set('highlights', 'title')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
const results = JSON.parse(res.body)
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
- expect(hit.highlights.headings).toBeTruthy()
- expect(hit.highlights.title).toBeFalsy()
+ expect(hit.highlights.title).toBeTruthy()
expect(hit.highlights.content).toBeFalsy()
}
})
diff --git a/src/search/tests/parse-page-sections-into-records.js b/src/search/tests/parse-page-sections-into-records.js
index d97e9bf42449..872f39be9067 100644
--- a/src/search/tests/parse-page-sections-into-records.js
+++ b/src/search/tests/parse-page-sections-into-records.js
@@ -41,7 +41,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
objectID: '/example/href',
breadcrumbs: 'GitHub Actions / actions learning path',
title: 'I am the page title',
- headings: 'First heading Second heading Table heading',
+ headings: 'First heading\nSecond heading\nTable heading',
content:
'This is an introduction to the article.\n' +
"In this article\nThis won't be ignored.\nFirst heading\n" +