From e8410d3c0b133605bed25d2dd1c83d91c05bf9a6 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:19:40 +1300
Subject: [PATCH 1/5] Add feature to include specific characters in the
Pagefind index
---
...nd-matches-custom-characters.toolproof.yml | 39 +++++
pagefind/src/fossick/mod.rs | 162 ++++++++++++++++--
pagefind/src/lib.rs | 19 +-
pagefind/src/options.rs | 20 ++-
pagefind/src/output/entry.rs | 1 +
pagefind/src/output/mod.rs | 8 +-
pagefind/src/utils.rs | 4 +
pagefind_web_js/lib/coupled_search.ts | 56 +++++-
pagefind_web_js/types/internal.d.ts | 1 +
9 files changed, 279 insertions(+), 31 deletions(-)
create mode 100644 pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
diff --git a/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
new file mode 100644
index 00000000..43396ea8
--- /dev/null
+++ b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
@@ -0,0 +1,39 @@
+name: Character Tests > Pagefind matches custom characters
+steps:
+ - ref: ./background.toolproof.yml
+ - step: I have a "public/page_a/index.html" file with the content {html}
+ html: >-
+
+      <p>Talking about @money</p>
+ - step: I have a "public/page_b/index.html" file with the content {html}
+ html: >-
+      <p>Configure a^b^c^d</p>
+ - macro: I run Pagefind with '--include-characters "@^"'
+ - step: stdout should contain "Running Pagefind"
+ - step: The file "public/pagefind/pagefind.js" should not be empty
+ - step: I serve the directory "public"
+ - step: In my browser, I load "/"
+ - step: In my browser, I evaluate {js}
+ js: |-
+ let pagefind = await import("/pagefind/pagefind.js");
+ let search = await pagefind.search("@");
+ let pages = await Promise.all(search.results.map(r => r.data()));
+
+ toolproof.assert_eq(pages.length, 1);
+ toolproof.assert_eq(pages[0].url, "/page_a/");
+ - step: In my browser, I evaluate {js}
+ js: |-
+ let pagefind = await import("/pagefind/pagefind.js");
+ let search = await pagefind.search("money");
+ let pages = await Promise.all(search.results.map(r => r.data()));
+
+ toolproof.assert_eq(pages.length, 1);
+ toolproof.assert_eq(pages[0].url, "/page_a/");
+ - step: In my browser, I evaluate {js}
+ js: |-
+ let pagefind = await import("/pagefind/pagefind.js");
+ let search = await pagefind.search("a^b^c^d");
+ let pages = await Promise.all(search.results.map(r => r.data()));
+
+ toolproof.assert_eq(pages.length, 1);
+ toolproof.assert_eq(pages[0].url, "/page_b/");
+ - step: In my browser, the console should be empty
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index b419b5ef..90b0f430 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -29,8 +29,6 @@ lazy_static! {
static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
static ref PRIVATE_PAGEFIND: Regex = Regex::new("___PAGEFIND_[\\S]+\\s?").unwrap();
- // TODO: i18n?
- static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap();
}
pub mod parser;
@@ -197,6 +195,7 @@ impl Fossicker {
fn parse_digest(
&mut self,
+ options: &SearchOptions,
) -> (
String,
HashMap<String, Vec<FossickedWord>>,
@@ -352,10 +351,24 @@ impl Fossicker {
if should_segment {
content.push('\u{200B}');
}
- let normalized_word = SPECIAL_CHARS
- .replace_all(word, "")
- .into_owned()
- .to_lowercase();
+ let mut normalized_word = String::with_capacity(word.len());
+ let mut possibly_compound = false;
+
+ for mut c in word.chars() {
+ let is_alpha = c.is_alphanumeric();
+ if !is_alpha {
+ possibly_compound = true;
+ }
+ if is_alpha || options.include_characters.contains(&c) {
+ c.make_ascii_lowercase();
+ if c.is_uppercase() {
+ // Non-ascii uppercase can lower to multiple chars
+ normalized_word.extend(c.to_lowercase());
+ } else {
+ normalized_word.push(c);
+ }
+ }
+ }
let word_weight = weight_stack.last().unwrap_or(&1);
if !normalized_word.is_empty() {
@@ -363,10 +376,12 @@ impl Fossicker {
}
// For words that may be CompoundWords, also index them as their constituent parts
- if normalized_word != word {
+ if possibly_compound {
let (word_parts, extras) = get_discrete_words(word);
// Only proceed if the word was broken into multiple parts
- if word_parts.contains(|c: char| c.is_whitespace()) {
+ if word_parts.contains(|c: char| c.is_whitespace())
+ || (!normalized_word.starts_with(&word_parts))
+ {
let part_words: Vec<_> = word_parts.split_whitespace().collect();
if !part_words.is_empty() {
@@ -456,7 +471,7 @@ impl Fossicker {
self.fossick_html(options).await;
};
- let (content, word_data, anchors, word_count) = self.parse_digest();
+ let (content, word_data, anchors, word_count) = self.parse_digest(options);
self.tidy_meta_and_filters();
let data = self.data.unwrap();
@@ -601,12 +616,14 @@ mod tests {
assert_eq!(&output, "Hello Wor ld?");
}
- async fn test_fossick(s: String) -> Fossicker {
+ fn test_opts() -> SearchOptions {
std::env::set_var("PAGEFIND_SOURCE", "somewhere");
let config =
PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap();
- let opts = SearchOptions::load(config).unwrap();
+ SearchOptions::load(config).unwrap()
+ }
+ async fn test_fossick(s: String) -> Fossicker {
let mut f = Fossicker {
file_path: Some("test/index.html".into()),
root_path: None,
@@ -615,7 +632,7 @@ mod tests {
data: None,
};
- _ = f.read_synthetic(&opts).await;
+ _ = f.read_synthetic(&test_opts()).await;
f
}
@@ -625,7 +642,7 @@ mod tests {
let mut f =
test_fossick(["", "Hello World!
", ""].concat()).await;
- let (digest, words, _, _) = f.parse_digest();
+ let (digest, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(digest, "Hello World!".to_string());
assert_eq!(
@@ -649,6 +666,117 @@ mod tests {
);
}
+ #[tokio::test]
+ async fn parse_chars() {
+ let mut f = test_fossick(
+ [
+ "",
+                "<p>He&llo htmltag<head> *before mid*dle after*</p>",
",
+ "",
+ ]
+ .concat(),
+ )
+ .await;
+
+ let mut opts = test_opts();
+ opts.include_characters.extend(['<', '>', '*']);
+ let (digest, words, _, _) = f.parse_digest(&opts);
+
+ assert_eq!(
+ digest,
+ "He&llo htmltag *before mid*dle after*.".to_string()
+ );
+ assert_eq!(
+ words,
+ HashMap::from_iter([
+ (
+ "he".to_string(),
+ vec![FossickedWord {
+ position: 0,
+ weight: 12
+ }]
+ ),
+ (
+ "llo".to_string(),
+ vec![FossickedWord {
+ position: 0,
+ weight: 12
+ }]
+ ),
+ (
+ "hello".to_string(),
+ vec![FossickedWord {
+ position: 0,
+ weight: 24
+ }]
+ ),
+ (
+ "htmltag".to_string(),
+ vec![FossickedWord {
+ position: 1,
+ weight: 24
+ }]
+ ),
+ (
+ "htmltag".to_string(),
+ vec![FossickedWord {
+ position: 1,
+ weight: 12
+ }]
+ ),
+ (
+ "head".to_string(),
+ vec![FossickedWord {
+ position: 1,
+ weight: 12
+ }]
+ ),
+ (
+ "*before".to_string(),
+ vec![FossickedWord {
+ position: 2,
+ weight: 24
+ }]
+ ),
+ (
+ "before".to_string(),
+ vec![FossickedWord {
+ position: 2,
+ weight: 24
+ }]
+ ),
+ (
+ "mid*dle".to_string(),
+ vec![FossickedWord {
+ position: 3,
+ weight: 24
+ }]
+ ),
+ (
+ "mid".to_string(),
+ vec![FossickedWord {
+ position: 3,
+ weight: 12
+ }]
+ ),
+ (
+ "dle".to_string(),
+ vec![FossickedWord {
+ position: 3,
+ weight: 12
+ }]
+ ),
+ (
+ "after*".to_string(),
+ vec![FossickedWord {
+ position: 4,
+ weight: 24
+ }]
+ )
+ ])
+ );
+ }
+
#[tokio::test]
async fn parse_weighted_file() {
let mut f = test_fossick(
@@ -665,7 +793,7 @@ mod tests {
)
.await;
- let (digest, words, _, _) = f.parse_digest();
+ let (digest, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(digest, "The Quick Brown. Fox Jumps Over. Ryan.".to_string());
assert_eq!(
@@ -743,7 +871,7 @@ mod tests {
)
.await;
- let (_, words, _, _) = f.parse_digest();
+ let (_, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(
words,
@@ -802,7 +930,7 @@ mod tests {
)
.await;
- let (_, words, _, _) = f.parse_digest();
+ let (_, words, _, _) = f.parse_digest(&test_opts());
assert_eq!(
words,
@@ -851,7 +979,7 @@ mod tests {
)
.await;
- let (_, words, _, _) = f.parse_digest();
+ let (_, words, _, _) = f.parse_digest(&test_opts());
let mut words = words.keys().collect::<Vec<_>>();
words.sort();
diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs
index cf7a95b3..403a7675 100644
--- a/pagefind/src/lib.rs
+++ b/pagefind/src/lib.rs
@@ -301,7 +301,13 @@ impl SearchState {
)
.await;
- output::write_common_to_disk(index_entries, self.options.write_playground, &outdir).await;
+ output::write_common_to_disk(
+ index_entries,
+ self.options.write_playground,
+ &outdir,
+ &self.options,
+ )
+ .await;
outdir
}
@@ -325,9 +331,14 @@ impl SearchState {
.collect();
files.extend(
- output::write_common_to_memory(index_entries, self.options.write_playground, outdir)
- .await
- .into_iter(),
+ output::write_common_to_memory(
+ index_entries,
+ self.options.write_playground,
+ outdir,
+ &self.options,
+ )
+ .await
+ .into_iter(),
);
// SyntheticFiles should only return the relative path to the file
diff --git a/pagefind/src/options.rs b/pagefind/src/options.rs
index e780a2cf..f8854a18 100644
--- a/pagefind/src/options.rs
+++ b/pagefind/src/options.rs
@@ -8,7 +8,10 @@ use std::{env, path::PathBuf};
use twelf::config;
use typed_builder::TypedBuilder;
-use crate::logging::{LogLevel, Logger};
+use crate::{
+ logging::{LogLevel, Logger},
+ utils::WORD_SYMBOLS,
+};
//
// If editing this configuration struct,
@@ -86,6 +89,13 @@ pub(crate) struct PagefindInboundConfig {
#[clap(required = false)]
pub(crate) force_language: Option<String>,
+ #[clap(
+ long,
+ help = "Include these characters when indexing and searching words. Useful for sites documenting technical topics such as programming languages."
+ )]
+ #[clap(required = false)]
+ pub(crate) include_characters: Option<String>,
+
#[clap(
long,
help = "Serve the source directory after creating the search index"
@@ -114,7 +124,6 @@ pub(crate) struct PagefindInboundConfig {
#[clap(
long,
- short,
help = "Only log errors while indexing the site. Does not impact the web-facing search."
)]
#[clap(required = false)]
@@ -214,6 +223,7 @@ pub(crate) struct SearchOptions {
pub(crate) exclude_selectors: Vec<String>,
pub(crate) glob: String,
pub(crate) force_language: Option,
+ pub(crate) include_characters: Vec<char>,
pub(crate) version: &'static str,
pub(crate) logger: Logger,
pub(crate) keep_index_url: bool,
@@ -280,6 +290,11 @@ impl SearchOptions {
site_source.join(subdir)
};
+ let mut include_characters = WORD_SYMBOLS.to_vec();
+ if let Some(custom_include_characters) = config.include_characters {
+ include_characters.extend(custom_include_characters.chars());
+ }
+
Ok(Self {
working_directory,
site_source,
@@ -288,6 +303,7 @@ impl SearchOptions {
exclude_selectors: config.exclude_selectors,
glob: config.glob,
force_language: config.force_language,
+ include_characters,
version: env!("CARGO_PKG_VERSION"),
logger: Logger::new(
log_level,
diff --git a/pagefind/src/output/entry.rs b/pagefind/src/output/entry.rs
index a7462420..2765416a 100644
--- a/pagefind/src/output/entry.rs
+++ b/pagefind/src/output/entry.rs
@@ -6,6 +6,7 @@ use serde::Serialize;
pub struct PagefindEntryMeta {
pub version: &'static str,
pub languages: HashMap<String, PagefindEntryLanguage>,
+ pub include_characters: Vec<char>,
}
#[derive(Serialize, Debug)]
diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs
index 7c134c37..96107a58 100644
--- a/pagefind/src/output/mod.rs
+++ b/pagefind/src/output/mod.rs
@@ -78,16 +78,18 @@ pub async fn write_common_to_disk(
language_indexes: Vec,
output_playground: bool,
outdir: &PathBuf,
+ options: &SearchOptions,
) {
- write_common(language_indexes, output_playground, outdir, false).await;
+ write_common(language_indexes, output_playground, outdir, options, false).await;
}
pub async fn write_common_to_memory(
language_indexes: Vec,
output_playground: bool,
outdir: &PathBuf,
+ options: &SearchOptions,
) -> Vec {
- write_common(language_indexes, output_playground, outdir, true)
+ write_common(language_indexes, output_playground, outdir, options, true)
.await
.unwrap()
}
@@ -96,6 +98,7 @@ async fn write_common(
language_indexes: Vec,
output_playground: bool,
outdir: &PathBuf,
+ options: &SearchOptions,
synthetic: bool,
) -> Option> {
let js_version = format!("const pagefind_version = \"{PAGEFIND_VERSION}\";");
@@ -116,6 +119,7 @@ async fn write_common(
},
)
})),
+ include_characters: options.include_characters.clone(),
};
let encoded_entry_meta = serde_json::to_string(&entry_meta).unwrap();
diff --git a/pagefind/src/utils.rs b/pagefind/src/utils.rs
index 4679f379..97195c85 100644
--- a/pagefind/src/utils.rs
+++ b/pagefind/src/utils.rs
@@ -1,5 +1,9 @@
use sha1::{Digest, Sha1};
+/// Symbols that count as part of a word
+/// (specifically, the "Punctuation, Connector" Unicode category)
+pub const WORD_SYMBOLS: [char; 10] = ['_', '‿', '⁀', '⁔', '︳', '︴', '﹍', '﹎', '﹏', '_'];
+
pub fn full_hash(bytes: &[u8]) -> String {
let mut hasher = Sha1::new();
hasher.update(bytes);
diff --git a/pagefind_web_js/lib/coupled_search.ts b/pagefind_web_js/lib/coupled_search.ts
index e9c1a7f7..731c8e36 100644
--- a/pagefind_web_js/lib/coupled_search.ts
+++ b/pagefind_web_js/lib/coupled_search.ts
@@ -33,6 +33,7 @@ export class PagefindInstance {
searchMeta: any;
languages: Record<string, PagefindEntryLanguage> | null;
loadedLanguage?: string;
+ includeCharacters?: string[];
version: string;
loadedVersion?: string;
@@ -203,6 +204,7 @@ export class PagefindInstance {
(await entry_response.json()) as internal.PagefindEntryJson;
this.languages = entry_json.languages;
this.loadedVersion = entry_json.version;
+ this.includeCharacters = entry_json.include_characters ?? [];
if (entry_json.version !== this.version) {
if (this.primary) {
console.warn(
@@ -491,14 +493,56 @@ export class PagefindInstance {
if (exact_search) {
log(`Running an exact search`);
}
- // Strip special characters to match the indexing operation
- // TODO: Maybe move regex over the wasm boundary, or otherwise work to match the Rust regex engine
- term = term
- .toLowerCase()
- .trim()
- .replace(/[\.`~!@#\$%\^&\*\(\)\{\}\[\]\\\|:;'",<>\/\?\-]/g, "")
+
+ let trueLanguage: string | null = null;
+ try {
+ trueLanguage = Intl.getCanonicalLocales(this.loadedLanguage)[0];
+ } catch (err) {
+ // Loaded language is not valid
+ }
+ const term_chunks: string[] = [];
+ let segments: string[];
+
+ // TODO: resolve type error for Intl.Segmenter
+ //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl'
+ if (trueLanguage && typeof Intl.Segmenter !== "undefined") {
+ //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl'
+ const segmenter = new Intl.Segmenter(trueLanguage, {
+ granularity: "grapheme",
+ });
+ segments = [...segmenter.segment(term)].map(
+ ({ segment }: { segment: string }) => segment,
+ );
+ } else {
+ segments = [...term];
+ }
+
+ for (const segment of segments) {
+ if (this.includeCharacters?.includes(segment)) {
+ term_chunks.push(segment);
+ } else if (
+ !/^\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps}$/u.test(segment)
+ ) {
+ term_chunks.push(segment.toLocaleLowerCase());
+ }
+
+ /**
+ * Notes:
+ * Regex to match the Rust \w class if we need it:
+ * /^\p{Pc}|\p{LC}|\p{Ll}|\p{Lm}|\p{Lo}|\p{Lt}|\p{Lu}|\p{Nd}|\p{Nl}|\p{No}|\s$/u
+ * ES2024 regex to match emoji if we need it:
+ * /\p{RGI_Emoji}/v
+ */
+ }
+
+ // TODO: We could use the "word" granularity for Intl.Segmenter to handle
+ // segmentation for non-whitespace-delimited languages.
+
+ term = term_chunks
+ .join("")
.replace(/\s{2,}/g, " ")
.trim();
+
log(`Normalized search term to ${term}`);
if (!term?.length && !filter_only) {
diff --git a/pagefind_web_js/types/internal.d.ts b/pagefind_web_js/types/internal.d.ts
index 14fed99b..2521f1d3 100644
--- a/pagefind_web_js/types/internal.d.ts
+++ b/pagefind_web_js/types/internal.d.ts
@@ -3,6 +3,7 @@ import "pagefindWeb";
export type PagefindEntryJson = {
version: string;
languages: Record<string, PagefindEntryLanguage>;
+ include_characters: string[];
};
export type PagefindEntryLanguage = {
From 7daffc37652a1872597135308d6c456ed17b7bc6 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:58:25 +1300
Subject: [PATCH 2/5] Docs
---
docs/content/docs/config-options.md | 23 ++++++++++++++++++++---
docs/content/docs/indexing.md | 22 +++++++++++++++++++++-
2 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/docs/content/docs/config-options.md b/docs/content/docs/config-options.md
index 209e9968..57b1c6a0 100644
--- a/docs/content/docs/config-options.md
+++ b/docs/content/docs/config-options.md
@@ -71,6 +71,23 @@ Note that currently Pagefind only supports lists of options via configuration fi
|---------------------------|------------------------------|---------------------|
| `--exclude-selectors ` | `PAGEFIND_EXCLUDE_SELECTORS` | `exclude_selectors` |
+### Include characters
+Prevents Pagefind from stripping the provided characters when indexing content.
+Allows users to search for words including these characters.
+
+See [Indexing special characters](/docs/indexing/#indexing-special-characters) for more documentation.
+
+Care is needed if setting this argument via the CLI, as special characters may be interpreted by your shell.
+Configure this via a [configuration file](/docs/config-sources/#config-files) if you encounter issues.
+
+```yml
+include_characters: "<>$"
+```
+
+| CLI Flag | ENV Variable | Config Key |
+|----------------------------|-------------------------------|---------------------|
+| `--include-characters <characters>` | `PAGEFIND_INCLUDE_CHARACTERS` | `include_characters` |
+
### Glob
Configures the glob used by Pagefind to discover HTML files. Defaults to `**/*.{html}`.
See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) for more details.
@@ -79,7 +96,7 @@ See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) f
|-----------------|-----------------|------------|
| `--glob <glob>` | `PAGEFIND_GLOB` | `glob` |
-### Force Language
+### Force language
Ignores any detected languages and creates a single index for the entire site as the provided language. Expects an ISO 639-1 code, such as `en` or `pt`.
See [Multilingual search](/docs/multilingual/) for more details.
@@ -88,14 +105,14 @@ See [Multilingual search](/docs/multilingual/) for more details.
|---------------------------|---------------------------|------------------|
| `--force-language <language>` | `PAGEFIND_FORCE_LANGUAGE` | `force_language` |
-### Keep Index URL
+### Keep index URL
Keeps `index.html` at the end of search result paths. By default, a file at `animals/cat/index.html` will be given the URL `/animals/cat/`. Setting this option to `true` will result in the URL `/animals/cat/index.html`.
| CLI Flag | ENV Variable | Config Key |
|--------------------|------------------|------------------|
| `--keep-index-url` | `KEEP_INDEX_URL` | `keep_index_url` |
-### Write Playground
+### Write playground
Writes the Pagefind playground files to `/playground` within your bundle directory. For most sites, this will make the Pagefind playground available at `/pagefind/playground/`.
This defaults to false, so playground files are not written to your live site. Playground files are always available when running Pagefind with `--serve`.
diff --git a/docs/content/docs/indexing.md b/docs/content/docs/indexing.md
index 08a3ed3c..44687028 100644
--- a/docs/content/docs/indexing.md
+++ b/docs/content/docs/indexing.md
@@ -92,5 +92,25 @@ Attributes of HTML elements can be added to the search index with the `data-page
```
{{< /diffcode >}}
-This attribute takes a comma-separated list of other attributes to include inline with the indexed content.
+This attribute takes a comma-separated list of other attributes to include inline with the indexed content.
The above example will be indexed as: `Condimentum Nullam. Image Title. Image Alt. Nullam id dolor id nibh ultricies.`
+
+## Indexing special characters
+
+By default, Pagefind strips most punctuation out of the page when indexing content. Punctuation is also removed from the search term when searching.
+
+For some sites, such as documentation for programming languages, searching for punctuation can be important. In these cases,
+the default behavior can be changed using the [Include Characters](/docs/config-options/#include-characters) option when running Pagefind.
+
+For example, given the following HTML:
+
+```html
+The <head> tag
+```
+
+Pagefind's default indexing would index `the`, `head`, and `tag`,
+and a user typing in a search term of `<head>` will have their search adapted to `head`.
+While this will still match the correct page, it won't distinguish between this result and a result talking about the head of a git repository.
+
+With the [Include Characters](/docs/config-options/#include-characters) option set to `<>`, Pagefind will instead index `the`, `<head>`, `head`, and `tag`.
+A search for `head` will still locate this page, while a search for `<head>` won't be rewritten and will specifically match this page.
From c692a639f6133e1449b5fe6fb3f8d51bebf0015f Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:59:02 +1300
Subject: [PATCH 3/5] Note in changelog to expand later
---
CHANGELOG.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11c88e1a..88a2a231 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
## Unreleased
+* Added the "Include Characters" option
* Added the Pagefind Playground
* Reduced filesizes for the Pagefind WebAssembly
From 3bdb9bd6b753558eeda2dabdedcd7a91d9a0d728 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 22:09:16 +1300
Subject: [PATCH 4/5] Fix missing argument
---
pagefind/src/fossick/mod.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 90b0f430..7b71751f 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -999,7 +999,7 @@ mod tests {
)
.await;
- let (content, words, _, _) = f.parse_digest();
+ let (content, words, _, _) = f.parse_digest(&test_opts());
let mut words = words.keys().collect::<Vec<_>>();
words.sort();
From b310e17f851f7a1d0f461d9f7e454ff1632b8baa Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 22:36:55 +1300
Subject: [PATCH 5/5] Resolve regressions around character-only words, and
standalone emoji
---
pagefind/src/fossick/mod.rs | 48 +++++++++++++++++++++----------------
1 file changed, 27 insertions(+), 21 deletions(-)
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 7b71751f..c33597f4 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -338,12 +338,12 @@ impl Fossicker {
// We use zero-width spaces as boundary values for some languages,
// so we make sure that all are removed from the source content before going into the index.
- let normalized_word = word.replace('\u{200B}', "");
- if normalized_word.is_empty() {
+ let base_word = word.replace('\u{200B}', "");
+ if base_word.is_empty() {
return;
}
- content.push_str(&word.replace('\u{200B}', ""));
+ content.push_str(&base_word);
if append_whitespace {
content.push(' ');
}
@@ -351,10 +351,10 @@ impl Fossicker {
if should_segment {
content.push('\u{200B}');
}
- let mut normalized_word = String::with_capacity(word.len());
+ let mut normalized_word = String::with_capacity(base_word.len());
let mut possibly_compound = false;
- for mut c in word.chars() {
+ for mut c in base_word.chars() {
let is_alpha = c.is_alphanumeric();
if !is_alpha {
possibly_compound = true;
@@ -378,25 +378,31 @@ impl Fossicker {
// For words that may be CompoundWords, also index them as their constituent parts
if possibly_compound {
let (word_parts, extras) = get_discrete_words(word);
- // Only proceed if the word was broken into multiple parts
- if word_parts.contains(|c: char| c.is_whitespace())
- || (!normalized_word.starts_with(&word_parts))
- {
- let part_words: Vec<_> = word_parts.split_whitespace().collect();
-
- if !part_words.is_empty() {
- // Index constituents of a compound word as a proportion of the
- // weight of the full word.
- let per_weight = (word_weight
- / part_words.len().try_into().unwrap_or(std::u8::MAX))
- .max(1);
-
- // Only index two+ character words
- for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
- store_word(part_word, total_word_index, per_weight);
+
+ // If this word normalized to nothing, we don't want to insert it here.
+ // (Though we do want to process the extras below, for things like emoji).
+ if !normalized_word.is_empty() {
+ // Only proceed if the word was broken into multiple parts
+ if word_parts.contains(|c: char| c.is_whitespace())
+ || (!normalized_word.starts_with(&word_parts))
+ {
+ let part_words: Vec<_> = word_parts.split_whitespace().collect();
+
+ if !part_words.is_empty() {
+ // Index constituents of a compound word as a proportion of the
+ // weight of the full word.
+ let per_weight = (word_weight
+ / part_words.len().try_into().unwrap_or(std::u8::MAX))
+ .max(1);
+
+ // Only index two+ character words
+ for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
+ store_word(part_word, total_word_index, per_weight);
+ }
}
}
}
+
// Additionally store any special extra characters we are given
if let Some(extras) = extras {
for extra in extras {