From e8410d3c0b133605bed25d2dd1c83d91c05bf9a6 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 27 Jan 2025 21:19:40 +1300 Subject: [PATCH 1/5] Add feature to include specific characters in the Pagefind index --- ...nd-matches-custom-characters.toolproof.yml | 39 +++++ pagefind/src/fossick/mod.rs | 162 ++++++++++++++++-- pagefind/src/lib.rs | 19 +- pagefind/src/options.rs | 20 ++- pagefind/src/output/entry.rs | 1 + pagefind/src/output/mod.rs | 8 +- pagefind/src/utils.rs | 4 + pagefind_web_js/lib/coupled_search.ts | 56 +++++- pagefind_web_js/types/internal.d.ts | 1 + 9 files changed, 279 insertions(+), 31 deletions(-) create mode 100644 pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml diff --git a/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml new file mode 100644 index 00000000..43396ea8 --- /dev/null +++ b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml @@ -0,0 +1,39 @@ +name: Character Tests > Pagefind matches custom characters +steps: + - ref: ./background.toolproof.yml + - step: I have a "public/page_a/index.html" file with the content {html} + html: >- +

Talking about @money

+ - step: I have a "public/page_b/index.html" file with the content {html} + html: >- +

Configure a^b^c^d

+ - macro: I run Pagefind with '--include-characters "@^"' + - step: stdout should contain "Running Pagefind" + - step: The file "public/pagefind/pagefind.js" should not be empty + - step: I serve the directory "public" + - step: In my browser, I load "/" + - step: In my browser, I evaluate {js} + js: |- + let pagefind = await import("/pagefind/pagefind.js"); + let search = await pagefind.search("@"); + let pages = await Promise.all(search.results.map(r => r.data())); + + toolproof.assert_eq(pages.length, 1); + toolproof.assert_eq(pages[0].url, "/page_a/"); + - step: In my browser, I evaluate {js} + js: |- + let pagefind = await import("/pagefind/pagefind.js"); + let search = await pagefind.search("money"); + let pages = await Promise.all(search.results.map(r => r.data())); + + toolproof.assert_eq(pages.length, 1); + toolproof.assert_eq(pages[0].url, "/page_a/"); + - step: In my browser, I evaluate {js} + js: |- + let pagefind = await import("/pagefind/pagefind.js"); + let search = await pagefind.search("a^b^c^d"); + let pages = await Promise.all(search.results.map(r => r.data())); + + toolproof.assert_eq(pages.length, 1); + toolproof.assert_eq(pages[0].url, "/page_b/"); + - step: In my browser, the console should be empty diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index b419b5ef..90b0f430 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -29,8 +29,6 @@ lazy_static! { static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap(); static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap(); static ref PRIVATE_PAGEFIND: Regex = Regex::new("___PAGEFIND_[\\S]+\\s?").unwrap(); - // TODO: i18n? 
- static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap(); } pub mod parser; @@ -197,6 +195,7 @@ impl Fossicker { fn parse_digest( &mut self, + options: &SearchOptions, ) -> ( String, HashMap>, @@ -352,10 +351,24 @@ impl Fossicker { if should_segment { content.push('\u{200B}'); } - let normalized_word = SPECIAL_CHARS - .replace_all(word, "") - .into_owned() - .to_lowercase(); + let mut normalized_word = String::with_capacity(word.len()); + let mut possibly_compound = false; + + for mut c in word.chars() { + let is_alpha = c.is_alphanumeric(); + if !is_alpha { + possibly_compound = true; + } + if is_alpha || options.include_characters.contains(&c) { + c.make_ascii_lowercase(); + if c.is_uppercase() { + // Non-ascii uppercase can lower to multiple chars + normalized_word.extend(c.to_lowercase()); + } else { + normalized_word.push(c); + } + } + } let word_weight = weight_stack.last().unwrap_or(&1); if !normalized_word.is_empty() { @@ -363,10 +376,12 @@ impl Fossicker { } // For words that may be CompoundWords, also index them as their constituent parts - if normalized_word != word { + if possibly_compound { let (word_parts, extras) = get_discrete_words(word); // Only proceed if the word was broken into multiple parts - if word_parts.contains(|c: char| c.is_whitespace()) { + if word_parts.contains(|c: char| c.is_whitespace()) + || (!normalized_word.starts_with(&word_parts)) + { let part_words: Vec<_> = word_parts.split_whitespace().collect(); if !part_words.is_empty() { @@ -456,7 +471,7 @@ impl Fossicker { self.fossick_html(options).await; }; - let (content, word_data, anchors, word_count) = self.parse_digest(); + let (content, word_data, anchors, word_count) = self.parse_digest(options); self.tidy_meta_and_filters(); let data = self.data.unwrap(); @@ -601,12 +616,14 @@ mod tests { assert_eq!(&output, "Hello Wor ld?"); } - async fn test_fossick(s: String) -> Fossicker { + fn test_opts() -> SearchOptions { std::env::set_var("PAGEFIND_SOURCE", "somewhere"); let 
config = PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap(); - let opts = SearchOptions::load(config).unwrap(); + SearchOptions::load(config).unwrap() + } + async fn test_fossick(s: String) -> Fossicker { let mut f = Fossicker { file_path: Some("test/index.html".into()), root_path: None, @@ -615,7 +632,7 @@ mod tests { data: None, }; - _ = f.read_synthetic(&opts).await; + _ = f.read_synthetic(&test_opts()).await; f } @@ -625,7 +642,7 @@ mod tests { let mut f = test_fossick(["", "

Hello World!

", ""].concat()).await; - let (digest, words, _, _) = f.parse_digest(); + let (digest, words, _, _) = f.parse_digest(&test_opts()); assert_eq!(digest, "Hello World!".to_string()); assert_eq!( @@ -649,6 +666,117 @@ mod tests { ); } + #[tokio::test] + async fn parse_chars() { + let mut f = test_fossick( + [ + "", + "

He&llo htmltag<head> *before mid*dle after*

", + "", + ] + .concat(), + ) + .await; + + let mut opts = test_opts(); + opts.include_characters.extend(['<', '>', '*']); + let (digest, words, _, _) = f.parse_digest(&opts); + + assert_eq!( + digest, + "He&llo htmltag *before mid*dle after*.".to_string() + ); + assert_eq!( + words, + HashMap::from_iter([ + ( + "he".to_string(), + vec![FossickedWord { + position: 0, + weight: 12 + }] + ), + ( + "llo".to_string(), + vec![FossickedWord { + position: 0, + weight: 12 + }] + ), + ( + "hello".to_string(), + vec![FossickedWord { + position: 0, + weight: 24 + }] + ), + ( + "htmltag".to_string(), + vec![FossickedWord { + position: 1, + weight: 24 + }] + ), + ( + "htmltag".to_string(), + vec![FossickedWord { + position: 1, + weight: 12 + }] + ), + ( + "head".to_string(), + vec![FossickedWord { + position: 1, + weight: 12 + }] + ), + ( + "*before".to_string(), + vec![FossickedWord { + position: 2, + weight: 24 + }] + ), + ( + "before".to_string(), + vec![FossickedWord { + position: 2, + weight: 24 + }] + ), + ( + "mid*dle".to_string(), + vec![FossickedWord { + position: 3, + weight: 24 + }] + ), + ( + "mid".to_string(), + vec![FossickedWord { + position: 3, + weight: 12 + }] + ), + ( + "dle".to_string(), + vec![FossickedWord { + position: 3, + weight: 12 + }] + ), + ( + "after*".to_string(), + vec![FossickedWord { + position: 4, + weight: 24 + }] + ) + ]) + ); + } + #[tokio::test] async fn parse_weighted_file() { let mut f = test_fossick( @@ -665,7 +793,7 @@ mod tests { ) .await; - let (digest, words, _, _) = f.parse_digest(); + let (digest, words, _, _) = f.parse_digest(&test_opts()); assert_eq!(digest, "The Quick Brown. Fox Jumps Over. 
Ryan.".to_string()); assert_eq!( @@ -743,7 +871,7 @@ mod tests { ) .await; - let (_, words, _, _) = f.parse_digest(); + let (_, words, _, _) = f.parse_digest(&test_opts()); assert_eq!( words, @@ -802,7 +930,7 @@ mod tests { ) .await; - let (_, words, _, _) = f.parse_digest(); + let (_, words, _, _) = f.parse_digest(&test_opts()); assert_eq!( words, @@ -851,7 +979,7 @@ mod tests { ) .await; - let (_, words, _, _) = f.parse_digest(); + let (_, words, _, _) = f.parse_digest(&test_opts()); let mut words = words.keys().collect::>(); words.sort(); diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs index cf7a95b3..403a7675 100644 --- a/pagefind/src/lib.rs +++ b/pagefind/src/lib.rs @@ -301,7 +301,13 @@ impl SearchState { ) .await; - output::write_common_to_disk(index_entries, self.options.write_playground, &outdir).await; + output::write_common_to_disk( + index_entries, + self.options.write_playground, + &outdir, + &self.options, + ) + .await; outdir } @@ -325,9 +331,14 @@ impl SearchState { .collect(); files.extend( - output::write_common_to_memory(index_entries, self.options.write_playground, outdir) - .await - .into_iter(), + output::write_common_to_memory( + index_entries, + self.options.write_playground, + outdir, + &self.options, + ) + .await + .into_iter(), ); // SyntheticFiles should only return the relative path to the file diff --git a/pagefind/src/options.rs b/pagefind/src/options.rs index e780a2cf..f8854a18 100644 --- a/pagefind/src/options.rs +++ b/pagefind/src/options.rs @@ -8,7 +8,10 @@ use std::{env, path::PathBuf}; use twelf::config; use typed_builder::TypedBuilder; -use crate::logging::{LogLevel, Logger}; +use crate::{ + logging::{LogLevel, Logger}, + utils::WORD_SYMBOLS, +}; // // If editing this configuration struct, @@ -86,6 +89,13 @@ pub(crate) struct PagefindInboundConfig { #[clap(required = false)] pub(crate) force_language: Option, + #[clap( + long, + help = "Include these characters when indexing and searching words. 
Useful for sites documenting technical topics such as programming languages." + )] + #[clap(required = false)] + pub(crate) include_characters: Option, + #[clap( long, help = "Serve the source directory after creating the search index" @@ -114,7 +124,6 @@ pub(crate) struct PagefindInboundConfig { #[clap( long, - short, help = "Only log errors while indexing the site. Does not impact the web-facing search." )] #[clap(required = false)] @@ -214,6 +223,7 @@ pub(crate) struct SearchOptions { pub(crate) exclude_selectors: Vec, pub(crate) glob: String, pub(crate) force_language: Option, + pub(crate) include_characters: Vec, pub(crate) version: &'static str, pub(crate) logger: Logger, pub(crate) keep_index_url: bool, @@ -280,6 +290,11 @@ impl SearchOptions { site_source.join(subdir) }; + let mut include_characters = WORD_SYMBOLS.to_vec(); + if let Some(custom_include_characters) = config.include_characters { + include_characters.extend(custom_include_characters.chars()); + } + Ok(Self { working_directory, site_source, @@ -288,6 +303,7 @@ impl SearchOptions { exclude_selectors: config.exclude_selectors, glob: config.glob, force_language: config.force_language, + include_characters, version: env!("CARGO_PKG_VERSION"), logger: Logger::new( log_level, diff --git a/pagefind/src/output/entry.rs b/pagefind/src/output/entry.rs index a7462420..2765416a 100644 --- a/pagefind/src/output/entry.rs +++ b/pagefind/src/output/entry.rs @@ -6,6 +6,7 @@ use serde::Serialize; pub struct PagefindEntryMeta { pub version: &'static str, pub languages: HashMap, + pub include_characters: Vec, } #[derive(Serialize, Debug)] diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs index 7c134c37..96107a58 100644 --- a/pagefind/src/output/mod.rs +++ b/pagefind/src/output/mod.rs @@ -78,16 +78,18 @@ pub async fn write_common_to_disk( language_indexes: Vec, output_playground: bool, outdir: &PathBuf, + options: &SearchOptions, ) { - write_common(language_indexes, output_playground, outdir, 
false).await; + write_common(language_indexes, output_playground, outdir, options, false).await; } pub async fn write_common_to_memory( language_indexes: Vec, output_playground: bool, outdir: &PathBuf, + options: &SearchOptions, ) -> Vec { - write_common(language_indexes, output_playground, outdir, true) + write_common(language_indexes, output_playground, outdir, options, true) .await .unwrap() } @@ -96,6 +98,7 @@ async fn write_common( language_indexes: Vec, output_playground: bool, outdir: &PathBuf, + options: &SearchOptions, synthetic: bool, ) -> Option> { let js_version = format!("const pagefind_version = \"{PAGEFIND_VERSION}\";"); @@ -116,6 +119,7 @@ async fn write_common( }, ) })), + include_characters: options.include_characters.clone(), }; let encoded_entry_meta = serde_json::to_string(&entry_meta).unwrap(); diff --git a/pagefind/src/utils.rs b/pagefind/src/utils.rs index 4679f379..97195c85 100644 --- a/pagefind/src/utils.rs +++ b/pagefind/src/utils.rs @@ -1,5 +1,9 @@ use sha1::{Digest, Sha1}; +/// Symbols that count as part of a word +/// (specifically, the "Punctuation, Connector" Unicode category) +pub const WORD_SYMBOLS: [char; 10] = ['_', '‿', '⁀', '⁔', '︳', '︴', '﹍', '﹎', '﹏', '_']; + pub fn full_hash(bytes: &[u8]) -> String { let mut hasher = Sha1::new(); hasher.update(bytes); diff --git a/pagefind_web_js/lib/coupled_search.ts b/pagefind_web_js/lib/coupled_search.ts index e9c1a7f7..731c8e36 100644 --- a/pagefind_web_js/lib/coupled_search.ts +++ b/pagefind_web_js/lib/coupled_search.ts @@ -33,6 +33,7 @@ export class PagefindInstance { searchMeta: any; languages: Record | null; loadedLanguage?: string; + includeCharacters?: string[]; version: string; loadedVersion?: string; @@ -203,6 +204,7 @@ export class PagefindInstance { (await entry_response.json()) as internal.PagefindEntryJson; this.languages = entry_json.languages; this.loadedVersion = entry_json.version; + this.includeCharacters = entry_json.include_characters ?? 
[]; if (entry_json.version !== this.version) { if (this.primary) { console.warn( @@ -491,14 +493,56 @@ export class PagefindInstance { if (exact_search) { log(`Running an exact search`); } - // Strip special characters to match the indexing operation - // TODO: Maybe move regex over the wasm boundary, or otherwise work to match the Rust regex engine - term = term - .toLowerCase() - .trim() - .replace(/[\.`~!@#\$%\^&\*\(\)\{\}\[\]\\\|:;'",<>\/\?\-]/g, "") + + let trueLanguage: string | null = null; + try { + trueLanguage = Intl.getCanonicalLocales(this.loadedLanguage)[0]; + } catch (err) { + // Loaded language is not valid + } + const term_chunks: string[] = []; + let segments: string[]; + + // TODO: resolve type error for Intl.Segmenter + //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl' + if (trueLanguage && typeof Intl.Segmenter !== "undefined") { + //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl' + const segmenter = new Intl.Segmenter(trueLanguage, { + granularity: "grapheme", + }); + segments = [...segmenter.segment(term)].map( + ({ segment }: { segment: string }) => segment, + ); + } else { + segments = [...term]; + } + + for (const segment of segments) { + if (this.includeCharacters?.includes(segment)) { + term_chunks.push(segment); + } else if ( + !/^\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps}$/u.test(segment) + ) { + term_chunks.push(segment.toLocaleLowerCase()); + } + + /** + * Notes: + * Regex to match the Rust \w class if we need it: + * /^\p{Pc}|\p{LC}|\p{Ll}|\p{Lm}|\p{Lo}|\p{Lt}|\p{Lu}|\p{Nd}|\p{Nl}|\p{No}|\s$/u + * ES2024 regex to match emoji if we need it: + * /\p{RGI_Emoji}/v + */ + } + + // TODO: We could use the "word" granularity for Intl.Segmenter to handle + // segmentation for non-whitespace-delimited languages. 
+ + term = term_chunks + .join("") .replace(/\s{2,}/g, " ") .trim(); + log(`Normalized search term to ${term}`); if (!term?.length && !filter_only) { diff --git a/pagefind_web_js/types/internal.d.ts b/pagefind_web_js/types/internal.d.ts index 14fed99b..2521f1d3 100644 --- a/pagefind_web_js/types/internal.d.ts +++ b/pagefind_web_js/types/internal.d.ts @@ -3,6 +3,7 @@ import "pagefindWeb"; export type PagefindEntryJson = { version: string; languages: Record; + include_characters: string[]; }; export type PagefindEntryLanguage = { From 7daffc37652a1872597135308d6c456ed17b7bc6 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 27 Jan 2025 21:58:25 +1300 Subject: [PATCH 2/5] Docs --- docs/content/docs/config-options.md | 23 ++++++++++++++++++++--- docs/content/docs/indexing.md | 22 +++++++++++++++++++++- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/docs/content/docs/config-options.md b/docs/content/docs/config-options.md index 209e9968..57b1c6a0 100644 --- a/docs/content/docs/config-options.md +++ b/docs/content/docs/config-options.md @@ -71,6 +71,23 @@ Note that currently Pagefind only supports lists of options via configuration fi |---------------------------|------------------------------|---------------------| | `--exclude-selectors ` | `PAGEFIND_EXCLUDE_SELECTORS` | `exclude_selectors` | +### Include characters +Prevents Pagefind from stripping the provided characters when indexing content. +Allows users to search for words including these characters. + +See [Indexing special characters](/docs/indexing/#indexing-special-characters) for more documentation. + +Care is needed if setting this argument via the CLI, as special characters may be interpreted by your shell. +Configure this via a [configuration file](/docs/config-sources/#config-files) if you encounter issues. 
+ +```yml +include_characters: "<>$" +``` + +| CLI Flag | ENV Variable | Config Key | +|----------------------------|-------------------------------|---------------------| +| `--include-characters ` | `PAGEFIND_INCLUDE_CHARACTERS` | `include_characters` | + ### Glob Configures the glob used by Pagefind to discover HTML files. Defaults to `**/*.{html}`. See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) for more details. @@ -79,7 +96,7 @@ See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) f |-----------------|-----------------|------------| | `--glob ` | `PAGEFIND_GLOB` | `glob` | -### Force Language +### Force language Ignores any detected languages and creates a single index for the entire site as the provided language. Expects an ISO 639-1 code, such as `en` or `pt`. See [Multilingual search](/docs/multilingual/) for more details. @@ -88,14 +105,14 @@ See [Multilingual search](/docs/multilingual/) for more details. |---------------------------|---------------------------|------------------| | `--force-language ` | `PAGEFIND_FORCE_LANGUAGE` | `force_language` | -### Keep Index URL +### Keep index URL Keeps `index.html` at the end of search result paths. By default, a file at `animals/cat/index.html` will be given the URL `/animals/cat/`. Setting this option to `true` will result in the URL `/animals/cat/index.html`. | CLI Flag | ENV Variable | Config Key | |--------------------|------------------|------------------| | `--keep-index-url` | `KEEP_INDEX_URL` | `keep_index_url` | -### Write Playground +### Write playground Writes the Pagefind playground files to `/playground` within your bundle directory. For most sites, this will make the Pagefind playground available at `/pagefind/playground/`. This defaults to false, so playground files are not written to your live site. Playground files are always available when running Pagefind with `--serve`. 
diff --git a/docs/content/docs/indexing.md b/docs/content/docs/indexing.md index 08a3ed3c..44687028 100644 --- a/docs/content/docs/indexing.md +++ b/docs/content/docs/indexing.md @@ -92,5 +92,25 @@ Attributes of HTML elements can be added to the search index with the `data-page ``` {{< /diffcode >}} -This attribute takes a comma-separated list of other attributes to include inline with the indexed content. +This attribute takes a comma-separated list of other attributes to include inline with the indexed content. The above example will be indexed as: `Condimentum Nullam. Image Title. Image Alt. Nullam id dolor id nibh ultricies.` + +## Indexing special characters + +By default, Pagefind strips most punctuation out of the page when indexing content. Punctuation is also removed from the search term when searching. + +For some sites, such as documentation for programming languages, searching for punctuation can be important. In these cases, +the default behavior can be changed using the [Include Characters](/docs/config-options/#include-characters) option when running Pagefind. + +For example, given the following HTML: + +```html +

The <head> tag

+``` + +Pagefind's default indexing would index `the`, `head`, and `tag`, +and a user typing in a search term of `<head>` will have their search adapted to `head`. +While this will still match the correct page, it won't distinguish between this result and a result talking about the head of a git repository. + +With the [Include Characters](/docs/config-options/#include-characters) option set to `<>`, Pagefind will instead index `the`, `<head>`, `head`, and `tag`. +A search for `head` will still locate this page, while a search for `<head>` won't be rewritten and will specifically match this page. From c692a639f6133e1449b5fe6fb3f8d51bebf0015f Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 27 Jan 2025 21:59:02 +1300 Subject: [PATCH 3/5] Note in changelog to expand later --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11c88e1a..88a2a231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ ## Unreleased +* Added the "Include Characters" option * Added the Pagefind Playground * Reduced filesizes for the Pagefind WebAssembly From 3bdb9bd6b753558eeda2dabdedcd7a91d9a0d728 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 27 Jan 2025 22:09:16 +1300 Subject: [PATCH 4/5] Fix missing argument --- pagefind/src/fossick/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 90b0f430..7b71751f 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -999,7 +999,7 @@ mod tests { ) .await; - let (content, words, _, _) = f.parse_digest(); + let (content, words, _, _) = f.parse_digest(&test_opts()); let mut words = words.keys().collect::>(); words.sort(); From b310e17f851f7a1d0f461d9f7e454ff1632b8baa Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 27 Jan 2025 22:36:55 +1300 Subject: [PATCH 5/5] 
Resolve regressions around character-only words, and standalone emoji --- pagefind/src/fossick/mod.rs | 48 +++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 7b71751f..c33597f4 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -338,12 +338,12 @@ impl Fossicker { // We use zero-width spaces as boundary values for some languages, // so we make sure that all are removed from the source content before going into the index. - let normalized_word = word.replace('\u{200B}', ""); - if normalized_word.is_empty() { + let base_word = word.replace('\u{200B}', ""); + if base_word.is_empty() { return; } - content.push_str(&word.replace('\u{200B}', "")); + content.push_str(&base_word); if append_whitespace { content.push(' '); } @@ -351,10 +351,10 @@ impl Fossicker { if should_segment { content.push('\u{200B}'); } - let mut normalized_word = String::with_capacity(word.len()); + let mut normalized_word = String::with_capacity(base_word.len()); let mut possibly_compound = false; - for mut c in word.chars() { + for mut c in base_word.chars() { let is_alpha = c.is_alphanumeric(); if !is_alpha { possibly_compound = true; @@ -378,25 +378,31 @@ impl Fossicker { // For words that may be CompoundWords, also index them as their constituent parts if possibly_compound { let (word_parts, extras) = get_discrete_words(word); - // Only proceed if the word was broken into multiple parts - if word_parts.contains(|c: char| c.is_whitespace()) - || (!normalized_word.starts_with(&word_parts)) - { - let part_words: Vec<_> = word_parts.split_whitespace().collect(); - - if !part_words.is_empty() { - // Index constituents of a compound word as a proportion of the - // weight of the full word. 
- let per_weight = (word_weight - / part_words.len().try_into().unwrap_or(std::u8::MAX)) - .max(1); - - // Only index two+ character words - for part_word in part_words.into_iter().filter(|w| w.len() > 1) { - store_word(part_word, total_word_index, per_weight); + + // If this word normalized to nothing, we don't want to insert it here. + // (Though we do want to process the extras below, for things like emoji). + if !normalized_word.is_empty() { + // Only proceed if the word was broken into multiple parts + if word_parts.contains(|c: char| c.is_whitespace()) + || (!normalized_word.starts_with(&word_parts)) + { + let part_words: Vec<_> = word_parts.split_whitespace().collect(); + + if !part_words.is_empty() { + // Index constituents of a compound word as a proportion of the + // weight of the full word. + let per_weight = (word_weight + / part_words.len().try_into().unwrap_or(std::u8::MAX)) + .max(1); + + // Only index two+ character words + for part_word in part_words.into_iter().filter(|w| w.len() > 1) { + store_word(part_word, total_word_index, per_weight); + } } } } + // Additionally store any special extra characters we are given if let Some(extras) = extras { for extra in extras {