diff --git a/src/search/components/SearchResults.tsx b/src/search/components/SearchResults.tsx index 792783528d94..13fbf40cfc24 100644 --- a/src/search/components/SearchResults.tsx +++ b/src/search/components/SearchResults.tsx @@ -72,6 +72,13 @@ function SearchResultHit({ const title = hit.highlights.title && hit.highlights.title.length > 0 ? hit.highlights.title[0] : hit.title + let content = '' + if (hit.highlights.content_explicit?.length) { + content = hit.highlights.content_explicit[0] + } else if (hit.highlights.content?.length) { + content = hit.highlights.content[0] + } + return (

@@ -99,9 +106,7 @@ function SearchResultHit({ }} > - {hit.highlights.content && hit.highlights.content.length > 0 && ( -

- )} + {content &&
} {debug && ( score: {hit.score} popularity:{' '} diff --git a/src/search/components/types.ts b/src/search/components/types.ts index f7ce0c5f458c..a7a35fba2975 100644 --- a/src/search/components/types.ts +++ b/src/search/components/types.ts @@ -6,6 +6,7 @@ export type SearchResultHitT = { highlights: { title?: string[] content?: string[] + content_explicit?: string[] } score?: number popularity?: number diff --git a/src/search/middleware/es-search.js b/src/search/middleware/es-search.js index d6e65b3a5443..6373de940585 100644 --- a/src/search/middleware/es-search.js +++ b/src/search/middleware/es-search.js @@ -1,6 +1,9 @@ import { Client } from '@elastic/elasticsearch' -export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings'] +export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content'] +// This needs to match what we *use* in the `<SearchResults>` component. +// For example, if we don't display "headings" we shouldn't request +// highlights for it either. export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content'] const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL @@ -87,7 +90,11 @@ export async function getSearchResults({ matchQuery.bool.filter = topicsFilter } - const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS + const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS) + // This acts as an alias for convenience + if (highlightFields.includes('content')) { + highlightFields.push('content_explicit') + } const highlight = getHighlightConfiguration(query, highlightFields) const searchQuery = { @@ -182,7 +189,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) { const BOOST_HEADINGS = 3.0 const BOOST_CONTENT = 1.0 const BOOST_AND = 2.5 - const BOOST_EXPLICIT = 3.5 + const BOOST_EXPLICIT = 6.5 // Number doesn't matter so much but just make sure it's // boosted low. Because we only really want this to come into // play if nothing else matches. E.g. 
a search for `AcIons` @@ -419,9 +426,6 @@ function getHighlightConfiguration(query, highlights) { number_of_fragments: 1, } } - if (highlights.includes('headings')) { - fields.headings = { fragment_size: 150, number_of_fragments: 2 } - } if (highlights.includes('content')) { // The 'no_match_size' is so we can display *something* for the // preview if there was no highlight match at all within the content. @@ -429,7 +433,7 @@ function getHighlightConfiguration(query, highlights) { // Fast Vector Highlighter // Using this requires that you first index these fields // with {term_vector: 'with_positions_offsets'} - type: 'fvh', // + type: 'fvh', fragment_size: 150, number_of_fragments: 1, no_match_size: 150, @@ -442,6 +446,27 @@ function getHighlightConfiguration(query, highlights) { }, }, } + // NOTE (JAN 2024) THIS IS DELIBERATELY COMMENTED OUT. FOR NOW... + // Once we know the indexes have all been rebuilt with the + // new `term_vector: 'with_positions_offsets',` everywhere we + // can un-comment this. + // fields.content_explicit = { + // // Fast Vector Highlighter + // // Using this requires that you first index these fields + // // with {term_vector: 'with_positions_offsets'} + // type: 'fvh', + // fragment_size: 150, + // number_of_fragments: 1, + // no_match_size: 0, + + // highlight_query: { + // match_phrase_prefix: { + // content_explicit: { + // query, + // }, + // }, + // }, + // } } return { diff --git a/src/search/scripts/index-elasticsearch.js b/src/search/scripts/index-elasticsearch.js index 292f2a090bac..cbcc92523fed 100755 --- a/src/search/scripts/index-elasticsearch.js +++ b/src/search/scripts/index-elasticsearch.js @@ -294,6 +294,19 @@ async function indexVersion(client, indexName, version, language, sourceDirector // CREATE INDEX const settings = { analysis: { + char_filter: { + // This will turn `runs-on` into `runs_on` so that it can't be + // tokenized to `runs` because `on` is a stop word. 
+ // It also means that prose terms, in English, like `opt-in` + // will not be matched if someone searches for `opt in`. But this + // is why we have multiple different analyzers. So it becomes + // `opt_in` in the `text_analyzer_explicit` analyzer, but is + // left as `opt` in the `text_analyzer` analyzer. + hyphenation_filter: { + type: 'mapping', + mappings: ['- => _'], + }, + }, analyzer: { // We defined to analyzers. Both based on a "common core" with the // `standard` tokenizer. But the second one adds Snowball filter. @@ -306,6 +319,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector // A great use-case of this when users search for keywords that are // code words like `dependency-name`. text_analyzer_explicit: { + char_filter: ['hyphenation_filter'], filter: ['lowercase', 'stop', 'asciifolding'], tokenizer: 'standard', type: 'custom', @@ -355,7 +369,13 @@ async function indexVersion(client, indexName, version, language, sourceDirector // the searches faster. term_vector: 'with_positions_offsets', }, - content_explicit: { type: 'text', analyzer: 'text_analyzer_explicit' }, + content_explicit: { + type: 'text', + analyzer: 'text_analyzer_explicit', + // This is used for fast highlighting. Uses more space but makes + // the searches faster. 
+ term_vector: 'with_positions_offsets', + }, headings: { type: 'text', analyzer: 'text_analyzer', norms: false }, headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false }, breadcrumbs: { type: 'text' }, @@ -371,6 +391,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector const operations = allRecords.flatMap((doc) => { const { title, objectID, content, breadcrumbs, headings, intro } = doc const contentEscaped = escapeHTML(content) + const headingsEscaped = escapeHTML(headings) const record = { url: objectID, title, @@ -378,8 +399,8 @@ async function indexVersion(client, indexName, version, language, sourceDirector content: contentEscaped, content_explicit: contentEscaped, breadcrumbs, - headings, - headings_explicit: headings, + headings: headingsEscaped, + headings_explicit: headingsEscaped, // This makes sure the popularities are always greater than 1. // Generally the 'popularity' is a ratio where the most popular // one of all is 1.0. 
diff --git a/src/search/scripts/parse-page-sections-into-records.js b/src/search/scripts/parse-page-sections-into-records.js index 0a3f3f1be606..a25b280a22b4 100644 --- a/src/search/scripts/parse-page-sections-into-records.js +++ b/src/search/scripts/parse-page-sections-into-records.js @@ -57,7 +57,7 @@ export default function parsePageSectionsIntoRecords(page) { const headings = $sections .map((i, el) => $(el).text()) .get() - .join(' ') + .join('\n') .trim() const intro = $('[data-search=lead] p').text().trim() diff --git a/src/search/tests/api-search.js b/src/search/tests/api-search.js index 1771731ccfb5..1b6c35ec34db 100644 --- a/src/search/tests/api-search.js +++ b/src/search/tests/api-search.js @@ -128,7 +128,6 @@ describeIfElasticsearchURL('search v1 middleware', () => { test('configurable highlights', async () => { const sp = new URLSearchParams() sp.set('query', 'introduction heading') - sp.append('highlights', 'headings') sp.append('highlights', 'content') const res = await get('/api/search/v1?' + sp) expect(res.statusCode).toBe(200) @@ -136,7 +135,6 @@ describeIfElasticsearchURL('search v1 middleware', () => { expect(results.meta.found.value).toBeGreaterThanOrEqual(1) for (const hit of results.hits) { expect(hit.highlights.title).toBeFalsy() - expect(hit.highlights.headings).toBeTruthy() expect(hit.highlights.content).toBeTruthy() } }) @@ -145,14 +143,13 @@ describeIfElasticsearchURL('search v1 middleware', () => { const sp = new URLSearchParams() // This will match because it's in the 'content' but not in 'headings' sp.set('query', 'Fact of life') - sp.set('highlights', 'headings') + sp.set('highlights', 'title') const res = await get('/api/search/v1?' 
+ sp) expect(res.statusCode).toBe(200) const results = JSON.parse(res.body) expect(results.meta.found.value).toBeGreaterThanOrEqual(1) for (const hit of results.hits) { - expect(hit.highlights.headings).toBeTruthy() - expect(hit.highlights.title).toBeFalsy() + expect(hit.highlights.title).toBeTruthy() expect(hit.highlights.content).toBeFalsy() } }) diff --git a/src/search/tests/parse-page-sections-into-records.js b/src/search/tests/parse-page-sections-into-records.js index d97e9bf42449..872f39be9067 100644 --- a/src/search/tests/parse-page-sections-into-records.js +++ b/src/search/tests/parse-page-sections-into-records.js @@ -41,7 +41,7 @@ describe('search parsePageSectionsIntoRecords module', () => { objectID: '/example/href', breadcrumbs: 'GitHub Actions / actions learning path', title: 'I am the page title', - headings: 'First heading Second heading Table heading', + headings: 'First heading\nSecond heading\nTable heading', content: 'This is an introduction to the article.\n' + "In this article\nThis won't be ignored.\nFirst heading\n" +