Treat hyphenated words strong in explicit analyzer (#46974)
peterbe authored Jan 12, 2024
1 parent c53df7a commit 971ca2f
Showing 7 changed files with 69 additions and 20 deletions.
11 changes: 8 additions & 3 deletions src/search/components/SearchResults.tsx
@@ -72,6 +72,13 @@ function SearchResultHit({
const title =
hit.highlights.title && hit.highlights.title.length > 0 ? hit.highlights.title[0] : hit.title

let content = ''
if (hit.highlights.content_explicit?.length) {
content = hit.highlights.content_explicit[0]
} else if (hit.highlights.content?.length) {
content = hit.highlights.content[0]
}

return (
<div className={cx('my-6', styles.search_result)} data-testid="search-result">
<p className="text-normal f5 color-fg-muted" style={{ wordSpacing: 2 }}>
@@ -99,9 +106,7 @@ function SearchResultHit({
}}
></Link>
</h2>
-{hit.highlights.content && hit.highlights.content.length > 0 && (
-<div dangerouslySetInnerHTML={{ __html: hit.highlights.content[0] }}></div>
-)}
+{content && <div dangerouslySetInnerHTML={{ __html: content }}></div>}
{debug && (
<Text as="p" fontWeight="bold">
score: <code style={{ marginRight: 10 }}>{hit.score}</code> popularity:{' '}
1 change: 1 addition & 0 deletions src/search/components/types.ts
@@ -6,6 +6,7 @@ export type SearchResultHitT = {
highlights: {
title?: string[]
content?: string[]
content_explicit?: string[]
}
score?: number
popularity?: number
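Taken together with the component change above, the new optional `content_explicit` field boils down to a simple fallback: prefer the highlight produced by the explicit analyzer when one exists, otherwise fall back to the regular `content` highlight. A minimal sketch of that selection logic (the helper name is illustrative, not part of the change):

```ts
// Hypothetical helper mirroring the fallback in SearchResultHit.
// The `highlights` shape matches the SearchResultHitT type above.
type Highlights = {
  title?: string[]
  content?: string[]
  content_explicit?: string[]
}

function pickContentHighlight(highlights: Highlights): string {
  if (highlights.content_explicit?.length) {
    return highlights.content_explicit[0]
  }
  if (highlights.content?.length) {
    return highlights.content[0]
  }
  return ''
}

// pickContentHighlight({ content: ['<mark>foo</mark> bar'] })
//   -> '<mark>foo</mark> bar'
// pickContentHighlight({ content_explicit: ['<mark>opt-in</mark> …'], content: ['…'] })
//   -> '<mark>opt-in</mark> …'
```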
39 changes: 32 additions & 7 deletions src/search/middleware/es-search.js
@@ -1,6 +1,9 @@
import { Client } from '@elastic/elasticsearch'

-export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content', 'headings']
+export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content']
// This needs to match what we *use* in the `<SearchResults>` component.
// For example, if we don't display "headings" we shouldn't request
// highlights for it either.
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']

const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL
@@ -87,7 +90,11 @@ export async function getSearchResults({
matchQuery.bool.filter = topicsFilter
}

-const highlightFields = highlights || DEFAULT_HIGHLIGHT_FIELDS
+const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
// This acts as a convenience alias
if (highlightFields.includes('content')) {
highlightFields.push('content_explicit')
}
const highlight = getHighlightConfiguration(query, highlightFields)

const searchQuery = {
@@ -182,7 +189,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
const BOOST_HEADINGS = 3.0
const BOOST_CONTENT = 1.0
const BOOST_AND = 2.5
-const BOOST_EXPLICIT = 3.5
+const BOOST_EXPLICIT = 6.5
// Number doesn't matter so much but just make sure it's
// boosted low. Because we only really want this to come into
// play if nothing else matches. E.g. a search for `AcIons`
@@ -419,17 +426,14 @@ function getHighlightConfiguration(query, highlights) {
number_of_fragments: 1,
}
}
-if (highlights.includes('headings')) {
-fields.headings = { fragment_size: 150, number_of_fragments: 2 }
-}
if (highlights.includes('content')) {
// The 'no_match_size' is so we can display *something* for the
// preview if there was no highlight match at all within the content.
fields.content = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
-type: 'fvh', //
+type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 150,
@@ -442,6 +446,27 @@
},
},
}
// NOTE (JAN 2024) THIS IS DELIBERATELY COMMENTED OUT. FOR NOW...
// Once we know the indexes have all been rebuilt with the
// new `term_vector: 'with_positions_offsets',` everywhere we
// can un-comment this.
// fields.content_explicit = {
// // Fast Vector Highlighter
// // Using this requires that you first index these fields
// // with {term_vector: 'with_positions_offsets'}
// type: 'fvh',
// fragment_size: 150,
// number_of_fragments: 1,
// no_match_size: 0,

// highlight_query: {
// match_phrase_prefix: {
// content_explicit: {
// query,
// },
// },
// },
// }
}

return {
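Two details of the `getSearchResults` change are easy to miss: the requested highlight list is copied with `Array.from` so `DEFAULT_HIGHLIGHT_FIELDS` is never mutated, and asking for `content` highlights implicitly also asks for `content_explicit`. A small sketch of that expansion (the standalone function is illustrative, not part of the change):

```ts
// Illustrative version of the alias expansion inside getSearchResults.
const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']

function expandHighlightFields(requested?: string[]): string[] {
  // Copy first so we never push onto DEFAULT_HIGHLIGHT_FIELDS itself.
  const fields = Array.from(requested ?? DEFAULT_HIGHLIGHT_FIELDS)
  // Asking for 'content' acts as an alias that also covers 'content_explicit'.
  if (fields.includes('content')) {
    fields.push('content_explicit')
  }
  return fields
}

// expandHighlightFields()          -> ['title', 'content', 'content_explicit']
// expandHighlightFields(['title']) -> ['title']
```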
27 changes: 24 additions & 3 deletions src/search/scripts/index-elasticsearch.js
@@ -294,6 +294,19 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// CREATE INDEX
const settings = {
analysis: {
char_filter: {
// This will turn `runs-on` into `runs_on` so that it can't be
// tokenized to `runs` because `on` is a stop word.
// It also means that prose terms, in English, like `opt-in`
// will not be matched if someone searches for `opt in`. But this
// is why we have multiple different analyzers. So it becomes
// `opt_in` in the `text_analyzer_explicit` analyzer, but is
// left as `opt` in the `text_analyzer` analyzer.
hyphenation_filter: {
type: 'mapping',
mappings: ['- => _'],
},
},
analyzer: {
// We define two analyzers. Both are based on a "common core" with the
// `standard` tokenizer, but the second one adds the Snowball filter.
@@ -306,6 +319,7 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// A great use-case of this is when users search for keywords that are
// code words like `dependency-name`.
text_analyzer_explicit: {
char_filter: ['hyphenation_filter'],
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
@@ -355,7 +369,13 @@ async function indexVersion(client, indexName, version, language, sourceDirector
// the searches faster.
term_vector: 'with_positions_offsets',
},
-content_explicit: { type: 'text', analyzer: 'text_analyzer_explicit' },
+content_explicit: {
+type: 'text',
+analyzer: 'text_analyzer_explicit',
+// This is used for fast highlighting. Uses more space but makes
+// the searches faster.
+term_vector: 'with_positions_offsets',
+},
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
breadcrumbs: { type: 'text' },
@@ -371,15 +391,16 @@ async function indexVersion(client, indexName, version, language, sourceDirector
const operations = allRecords.flatMap((doc) => {
const { title, objectID, content, breadcrumbs, headings, intro } = doc
const contentEscaped = escapeHTML(content)
const headingsEscaped = escapeHTML(headings)
const record = {
url: objectID,
title,
title_explicit: title,
content: contentEscaped,
content_explicit: contentEscaped,
breadcrumbs,
-headings,
-headings_explicit: headings,
+headings: headingsEscaped,
+headings_explicit: headingsEscaped,
// This makes sure the popularities are always greater than 1.
// Generally the 'popularity' is a ratio where the most popular
// one of all is 1.0.
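To see what the new `hyphenation_filter` actually does, the Elasticsearch `_analyze` API can run both analyzers against the same input once an index exists with the settings above. A rough sketch, assuming an `@elastic/elasticsearch` v8 client and an existing index name; the exact tokens depend on the rest of the analyzer chain (e.g. stemming):

```ts
import { Client } from '@elastic/elasticsearch'

const client = new Client({
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200',
})

async function compareAnalyzers(indexName: string, text = 'runs-on') {
  // Explicit analyzer: the `- => _` mapping runs before tokenization,
  // so `runs-on` survives as the single token `runs_on`.
  const explicit = await client.indices.analyze({
    index: indexName,
    analyzer: 'text_analyzer_explicit',
    text,
  })
  console.log(explicit.tokens?.map((t) => t.token)) // expected: ['runs_on']

  // Regular analyzer: the hyphen splits the term, `on` is dropped as a
  // stop word, and the remaining token may be stemmed.
  const regular = await client.indices.analyze({
    index: indexName,
    analyzer: 'text_analyzer',
    text,
  })
  console.log(regular.tokens?.map((t) => t.token)) // e.g. ['runs'] or ['run']
}
```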
2 changes: 1 addition & 1 deletion src/search/scripts/parse-page-sections-into-records.js
@@ -57,7 +57,7 @@ export default function parsePageSectionsIntoRecords(page) {
const headings = $sections
.map((i, el) => $(el).text())
.get()
-.join(' ')
+.join('\n')
.trim()

const intro = $('[data-search=lead] p').text().trim()
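The only change here is the separator used when the section headings are flattened into the `headings` field: a newline instead of a space, so each heading stays on its own line in the indexed record (the updated test fixture further down reflects this). For example:

```ts
const headingTexts = ['First heading', 'Second heading', 'Table heading']

// Before: heading boundaries disappear into ordinary word gaps.
headingTexts.join(' ') // 'First heading Second heading Table heading'

// After: each heading keeps its own line in the stored field.
headingTexts.join('\n') // 'First heading\nSecond heading\nTable heading'
```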
7 changes: 2 additions & 5 deletions src/search/tests/api-search.js
@@ -128,15 +128,13 @@ describeIfElasticsearchURL('search v1 middleware', () => {
test('configurable highlights', async () => {
const sp = new URLSearchParams()
sp.set('query', 'introduction heading')
-sp.append('highlights', 'headings')
sp.append('highlights', 'content')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
const results = JSON.parse(res.body)
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
expect(hit.highlights.title).toBeFalsy()
-expect(hit.highlights.headings).toBeTruthy()
expect(hit.highlights.content).toBeTruthy()
}
})
@@ -145,14 +143,13 @@ describeIfElasticsearchURL('search v1 middleware', () => {
const sp = new URLSearchParams()
// This will match because it's in the 'content' but not in 'headings'
sp.set('query', 'Fact of life')
-sp.set('highlights', 'headings')
+sp.set('highlights', 'title')
const res = await get('/api/search/v1?' + sp)
expect(res.statusCode).toBe(200)
const results = JSON.parse(res.body)
expect(results.meta.found.value).toBeGreaterThanOrEqual(1)
for (const hit of results.hits) {
-expect(hit.highlights.headings).toBeTruthy()
-expect(hit.highlights.title).toBeFalsy()
+expect(hit.highlights.title).toBeTruthy()
expect(hit.highlights.content).toBeFalsy()
}
})
2 changes: 1 addition & 1 deletion src/search/tests/parse-page-sections-into-records.js
@@ -41,7 +41,7 @@ describe('search parsePageSectionsIntoRecords module', () => {
objectID: '/example/href',
breadcrumbs: 'GitHub Actions / actions learning path',
title: 'I am the page title',
-headings: 'First heading Second heading Table heading',
+headings: 'First heading\nSecond heading\nTable heading',
content:
'This is an introduction to the article.\n' +
"In this article\nThis won't be ignored.\nFirst heading\n" +