From e8410d3c0b133605bed25d2dd1c83d91c05bf9a6 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:19:40 +1300
Subject: [PATCH 1/5] Add feature to include specific characters in the
 Pagefind index

---
 ...nd-matches-custom-characters.toolproof.yml |  39 +++++
 pagefind/src/fossick/mod.rs                   | 162 ++++++++++++++++--
 pagefind/src/lib.rs                           |  19 +-
 pagefind/src/options.rs                       |  20 ++-
 pagefind/src/output/entry.rs                  |   1 +
 pagefind/src/output/mod.rs                    |   8 +-
 pagefind/src/utils.rs                         |   4 +
 pagefind_web_js/lib/coupled_search.ts         |  56 +++++-
 pagefind_web_js/types/internal.d.ts           |   1 +
 9 files changed, 279 insertions(+), 31 deletions(-)
 create mode 100644 pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
diff --git a/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
new file mode 100644
index 00000000..43396ea8
--- /dev/null
+++ b/pagefind/integration_tests/characters/pagefind-matches-custom-characters.toolproof.yml
@@ -0,0 +1,39 @@
+name: Character Tests > Pagefind matches custom characters
+steps:
+  - ref: ./background.toolproof.yml
+  - step: I have a "public/page_a/index.html" file with the content {html}
+    html: >-
+      <!DOCTYPE html><html lang="en"><head></head><body><h1>Talking about @money</h1></body></html>
+  - step: I have a "public/page_b/index.html" file with the content {html}
+    html: >-
+      <!DOCTYPE html><html lang="en"><head></head><body><h1>Configure a^b^c^d</h1></body></html>
+  - macro: I run Pagefind with '--include-characters "@^"'
+  - step: stdout should contain "Running Pagefind"
+  - step: The file "public/pagefind/pagefind.js" should not be empty
+  - step: I serve the directory "public"
+  - step: In my browser, I load "/"
+  - step: In my browser, I evaluate {js}
+    js: |-
+      let pagefind = await import("/pagefind/pagefind.js");
+      let search = await pagefind.search("@");
+      let pages = await Promise.all(search.results.map(r => r.data()));
+
+      toolproof.assert_eq(pages.length, 1);
+      toolproof.assert_eq(pages[0].url, "/page_a/");
+  - step: In my browser, I evaluate {js}
+    js: |-
+      let pagefind = await import("/pagefind/pagefind.js");
+      let search = await pagefind.search("money");
+      let pages = await Promise.all(search.results.map(r => r.data()));
+
+      toolproof.assert_eq(pages.length, 1);
+      toolproof.assert_eq(pages[0].url, "/page_a/");
+  - step: In my browser, I evaluate {js}
+    js: |-
+      let pagefind = await import("/pagefind/pagefind.js");
+      let search = await pagefind.search("a^b^c^d");
+      let pages = await Promise.all(search.results.map(r => r.data()));
+
+      toolproof.assert_eq(pages.length, 1);
+      toolproof.assert_eq(pages[0].url, "/page_b/");
+  - step: In my browser, the console should be empty
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index b419b5ef..90b0f430 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -29,8 +29,6 @@ lazy_static! {
     static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
     static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
     static ref PRIVATE_PAGEFIND: Regex = Regex::new("___PAGEFIND_[\\S]+\\s?").unwrap();
-    // TODO: i18n?
-    static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap();
 }
 
 pub mod parser;
@@ -197,6 +195,7 @@ impl Fossicker {
 
     fn parse_digest(
         &mut self,
+        options: &SearchOptions,
     ) -> (
         String,
         HashMap<String, Vec<FossickedWord>>,
@@ -352,10 +351,24 @@ impl Fossicker {
             if should_segment {
                 content.push('\u{200B}');
             }
-            let normalized_word = SPECIAL_CHARS
-                .replace_all(word, "")
-                .into_owned()
-                .to_lowercase();
+            let mut normalized_word = String::with_capacity(word.len());
+            let mut possibly_compound = false;
+
+            for mut c in word.chars() {
+                let is_alpha = c.is_alphanumeric();
+                if !is_alpha {
+                    possibly_compound = true;
+                }
+                if is_alpha || options.include_characters.contains(&c) {
+                    if c.is_ascii() {
+                        c.make_ascii_lowercase();
+                        normalized_word.push(c);
+                    } else {
+                        // Unicode lowering can yield several chars; also covers titlecase
+                        normalized_word.extend(c.to_lowercase());
+                    }
+                }
+            }
 
             let word_weight = weight_stack.last().unwrap_or(&1);
             if !normalized_word.is_empty() {
@@ -363,10 +376,12 @@ impl Fossicker {
             }
 
             // For words that may be CompoundWords, also index them as their constituent parts
-            if normalized_word != word {
+            if possibly_compound {
                 let (word_parts, extras) = get_discrete_words(word);
                 // Only proceed if the word was broken into multiple parts
-                if word_parts.contains(|c: char| c.is_whitespace()) {
+                if word_parts.contains(|c: char| c.is_whitespace())
+                    || (!normalized_word.starts_with(&word_parts))
+                {
                     let part_words: Vec<_> = word_parts.split_whitespace().collect();
 
                     if !part_words.is_empty() {
@@ -456,7 +471,7 @@ impl Fossicker {
             self.fossick_html(options).await;
         };
 
-        let (content, word_data, anchors, word_count) = self.parse_digest();
+        let (content, word_data, anchors, word_count) = self.parse_digest(options);
         self.tidy_meta_and_filters();
 
         let data = self.data.unwrap();
@@ -601,12 +616,14 @@ mod tests {
         assert_eq!(&output, "Hello Wor ld?");
     }
 
-    async fn test_fossick(s: String) -> Fossicker {
+    fn test_opts() -> SearchOptions {
         std::env::set_var("PAGEFIND_SOURCE", "somewhere");
         let config =
             PagefindInboundConfig::with_layers(&[Layer::Env(Some("PAGEFIND_".into()))]).unwrap();
-        let opts = SearchOptions::load(config).unwrap();
+        SearchOptions::load(config).unwrap()
+    }
 
+    async fn test_fossick(s: String) -> Fossicker {
         let mut f = Fossicker {
             file_path: Some("test/index.html".into()),
             root_path: None,
@@ -615,7 +632,7 @@ mod tests {
             data: None,
         };
 
-        _ = f.read_synthetic(&opts).await;
+        _ = f.read_synthetic(&test_opts()).await;
 
         f
     }
@@ -625,7 +642,7 @@ mod tests {
         let mut f =
             test_fossick(["<html><body>", "<p>Hello World!</p>", "</body></html>"].concat()).await;
 
-        let (digest, words, _, _) = f.parse_digest();
+        let (digest, words, _, _) = f.parse_digest(&test_opts());
 
         assert_eq!(digest, "Hello World!".to_string());
         assert_eq!(
@@ -649,6 +666,117 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn parse_chars() {
+        let mut f = test_fossick(
+            [
+                "<html><body>",
+                "<p>He&amp;llo htmltag&lt;head&gt; *before mid*dle after*</p>",
+                "</body></html>",
+            ]
+            .concat(),
+        )
+        .await;
+
+        let mut opts = test_opts();
+        opts.include_characters.extend(['<', '>', '*']);
+        let (digest, words, _, _) = f.parse_digest(&opts);
+
+        assert_eq!(
+            digest,
+            "He&llo htmltag<head> *before mid*dle after*.".to_string()
+        );
+        assert_eq!(
+            words,
+            HashMap::from_iter([
+                (
+                    "he".to_string(),
+                    vec![FossickedWord {
+                        position: 0,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "llo".to_string(),
+                    vec![FossickedWord {
+                        position: 0,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "hello".to_string(),
+                    vec![FossickedWord {
+                        position: 0,
+                        weight: 24
+                    }]
+                ),
+                (
+                    "htmltag<head>".to_string(),
+                    vec![FossickedWord {
+                        position: 1,
+                        weight: 24
+                    }]
+                ),
+                (
+                    "htmltag".to_string(),
+                    vec![FossickedWord {
+                        position: 1,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "head".to_string(),
+                    vec![FossickedWord {
+                        position: 1,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "*before".to_string(),
+                    vec![FossickedWord {
+                        position: 2,
+                        weight: 24
+                    }]
+                ),
+                (
+                    "before".to_string(),
+                    vec![FossickedWord {
+                        position: 2,
+                        weight: 24
+                    }]
+                ),
+                (
+                    "mid*dle".to_string(),
+                    vec![FossickedWord {
+                        position: 3,
+                        weight: 24
+                    }]
+                ),
+                (
+                    "mid".to_string(),
+                    vec![FossickedWord {
+                        position: 3,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "dle".to_string(),
+                    vec![FossickedWord {
+                        position: 3,
+                        weight: 12
+                    }]
+                ),
+                (
+                    "after*".to_string(),
+                    vec![FossickedWord {
+                        position: 4,
+                        weight: 24
+                    }]
+                )
+            ])
+        );
+    }
+
     #[tokio::test]
     async fn parse_weighted_file() {
         let mut f = test_fossick(
@@ -665,7 +793,7 @@ mod tests {
         )
         .await;
 
-        let (digest, words, _, _) = f.parse_digest();
+        let (digest, words, _, _) = f.parse_digest(&test_opts());
 
         assert_eq!(digest, "The Quick Brown. Fox Jumps Over. Ryan.".to_string());
         assert_eq!(
@@ -743,7 +871,7 @@ mod tests {
         )
         .await;
 
-        let (_, words, _, _) = f.parse_digest();
+        let (_, words, _, _) = f.parse_digest(&test_opts());
 
         assert_eq!(
             words,
@@ -802,7 +930,7 @@ mod tests {
         )
         .await;
 
-        let (_, words, _, _) = f.parse_digest();
+        let (_, words, _, _) = f.parse_digest(&test_opts());
 
         assert_eq!(
             words,
@@ -851,7 +979,7 @@ mod tests {
         )
         .await;
 
-        let (_, words, _, _) = f.parse_digest();
+        let (_, words, _, _) = f.parse_digest(&test_opts());
 
         let mut words = words.keys().collect::<Vec<_>>();
         words.sort();
diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs
index cf7a95b3..403a7675 100644
--- a/pagefind/src/lib.rs
+++ b/pagefind/src/lib.rs
@@ -301,7 +301,13 @@ impl SearchState {
         )
         .await;
 
-        output::write_common_to_disk(index_entries, self.options.write_playground, &outdir).await;
+        output::write_common_to_disk(
+            index_entries,
+            self.options.write_playground,
+            &outdir,
+            &self.options,
+        )
+        .await;
 
         outdir
     }
@@ -325,9 +331,14 @@ impl SearchState {
             .collect();
 
         files.extend(
-            output::write_common_to_memory(index_entries, self.options.write_playground, outdir)
-                .await
-                .into_iter(),
+            output::write_common_to_memory(
+                index_entries,
+                self.options.write_playground,
+                outdir,
+                &self.options,
+            )
+            .await
+            .into_iter(),
         );
 
         // SyntheticFiles should only return the relative path to the file
diff --git a/pagefind/src/options.rs b/pagefind/src/options.rs
index e780a2cf..f8854a18 100644
--- a/pagefind/src/options.rs
+++ b/pagefind/src/options.rs
@@ -8,7 +8,10 @@ use std::{env, path::PathBuf};
 use twelf::config;
 use typed_builder::TypedBuilder;
 
-use crate::logging::{LogLevel, Logger};
+use crate::{
+    logging::{LogLevel, Logger},
+    utils::WORD_SYMBOLS,
+};
 
 //
 // If editing this configuration struct,
@@ -86,6 +89,13 @@ pub(crate) struct PagefindInboundConfig {
     #[clap(required = false)]
     pub(crate) force_language: Option<String>,
 
+    #[clap(
+        long,
+        help = "Include these characters when indexing and searching words. Useful for sites documenting technical topics such as programming languages."
+    )]
+    #[clap(required = false)]
+    pub(crate) include_characters: Option<String>,
+
     #[clap(
         long,
         help = "Serve the source directory after creating the search index"
@@ -114,7 +124,6 @@ pub(crate) struct PagefindInboundConfig {
 
     #[clap(
         long,
-        short,
         help = "Only log errors while indexing the site. Does not impact the web-facing search."
     )]
     #[clap(required = false)]
@@ -214,6 +223,7 @@ pub(crate) struct SearchOptions {
     pub(crate) exclude_selectors: Vec<String>,
     pub(crate) glob: String,
     pub(crate) force_language: Option<String>,
+    pub(crate) include_characters: Vec<char>,
     pub(crate) version: &'static str,
     pub(crate) logger: Logger,
     pub(crate) keep_index_url: bool,
@@ -280,6 +290,11 @@ impl SearchOptions {
                 site_source.join(subdir)
             };
 
+            let mut include_characters = WORD_SYMBOLS.to_vec();
+            if let Some(custom_include_characters) = config.include_characters {
+                include_characters.extend(custom_include_characters.chars());
+            }
+
             Ok(Self {
                 working_directory,
                 site_source,
@@ -288,6 +303,7 @@ impl SearchOptions {
                 exclude_selectors: config.exclude_selectors,
                 glob: config.glob,
                 force_language: config.force_language,
+                include_characters,
                 version: env!("CARGO_PKG_VERSION"),
                 logger: Logger::new(
                     log_level,
diff --git a/pagefind/src/output/entry.rs b/pagefind/src/output/entry.rs
index a7462420..2765416a 100644
--- a/pagefind/src/output/entry.rs
+++ b/pagefind/src/output/entry.rs
@@ -6,6 +6,7 @@ use serde::Serialize;
 pub struct PagefindEntryMeta {
     pub version: &'static str,
     pub languages: HashMap<String, PagefindEntryLanguage>,
+    pub include_characters: Vec<char>,
 }
 
 #[derive(Serialize, Debug)]
diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs
index 7c134c37..96107a58 100644
--- a/pagefind/src/output/mod.rs
+++ b/pagefind/src/output/mod.rs
@@ -78,16 +78,18 @@ pub async fn write_common_to_disk(
     language_indexes: Vec<LanguageMeta>,
     output_playground: bool,
     outdir: &PathBuf,
+    options: &SearchOptions,
 ) {
-    write_common(language_indexes, output_playground, outdir, false).await;
+    write_common(language_indexes, output_playground, outdir, options, false).await;
 }
 
 pub async fn write_common_to_memory(
     language_indexes: Vec<LanguageMeta>,
     output_playground: bool,
     outdir: &PathBuf,
+    options: &SearchOptions,
 ) -> Vec<SyntheticFile> {
-    write_common(language_indexes, output_playground, outdir, true)
+    write_common(language_indexes, output_playground, outdir, options, true)
         .await
         .unwrap()
 }
@@ -96,6 +98,7 @@ async fn write_common(
     language_indexes: Vec<LanguageMeta>,
     output_playground: bool,
     outdir: &PathBuf,
+    options: &SearchOptions,
     synthetic: bool,
 ) -> Option<Vec<SyntheticFile>> {
     let js_version = format!("const pagefind_version = \"{PAGEFIND_VERSION}\";");
@@ -116,6 +119,7 @@ async fn write_common(
                 },
             )
         })),
+        include_characters: options.include_characters.clone(),
     };
     let encoded_entry_meta = serde_json::to_string(&entry_meta).unwrap();
 
diff --git a/pagefind/src/utils.rs b/pagefind/src/utils.rs
index 4679f379..97195c85 100644
--- a/pagefind/src/utils.rs
+++ b/pagefind/src/utils.rs
@@ -1,5 +1,9 @@
 use sha1::{Digest, Sha1};
 
+/// Symbols that count as part of a word
+/// (specifically, the "Punctuation, Connector" Unicode category)
+pub const WORD_SYMBOLS: [char; 10] = ['_', '‿', '⁀', '⁔', '︳', '︴', '﹍', '﹎', '﹏', '＿'];
+
 pub fn full_hash(bytes: &[u8]) -> String {
     let mut hasher = Sha1::new();
     hasher.update(bytes);
diff --git a/pagefind_web_js/lib/coupled_search.ts b/pagefind_web_js/lib/coupled_search.ts
index e9c1a7f7..731c8e36 100644
--- a/pagefind_web_js/lib/coupled_search.ts
+++ b/pagefind_web_js/lib/coupled_search.ts
@@ -33,6 +33,7 @@ export class PagefindInstance {
   searchMeta: any;
   languages: Record<string, internal.PagefindEntryLanguage> | null;
   loadedLanguage?: string;
+  includeCharacters?: string[];
 
   version: string;
   loadedVersion?: string;
@@ -203,6 +204,7 @@ export class PagefindInstance {
         (await entry_response.json()) as internal.PagefindEntryJson;
       this.languages = entry_json.languages;
       this.loadedVersion = entry_json.version;
+      this.includeCharacters = entry_json.include_characters ?? [];
       if (entry_json.version !== this.version) {
         if (this.primary) {
           console.warn(
@@ -491,14 +493,56 @@ export class PagefindInstance {
     if (exact_search) {
       log(`Running an exact search`);
     }
-    // Strip special characters to match the indexing operation
-    // TODO: Maybe move regex over the wasm boundary, or otherwise work to match the Rust regex engine
-    term = term
-      .toLowerCase()
-      .trim()
-      .replace(/[\.`~!@#\$%\^&\*\(\)\{\}\[\]\\\|:;'",<>\/\?\-]/g, "")
+
+    let trueLanguage: string | null = null;
+    try {
+      trueLanguage = Intl.getCanonicalLocales(this.loadedLanguage)[0];
+    } catch (err) {
+      // Loaded language is not valid
+    }
+    const term_chunks: string[] = [];
+    let segments: string[];
+
+    // TODO: resolve type error for Intl.Segmenter
+    //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl'
+    if (trueLanguage && typeof Intl.Segmenter !== "undefined") {
+      //@ts-ignore: Property 'Segmenter' does not exist on type 'typeof Intl'
+      const segmenter = new Intl.Segmenter(trueLanguage, {
+        granularity: "grapheme",
+      });
+      segments = [...segmenter.segment(term)].map(
+        ({ segment }: { segment: string }) => segment,
+      );
+    } else {
+      segments = [...term];
+    }
+
+    for (const segment of segments) {
+      // Keep explicitly included characters verbatim; drop segments led by punctuation.
+      // A character class is used so the `^` anchor applies to every category.
+      if (this.includeCharacters?.includes(segment)) {
+        term_chunks.push(segment);
+      } else if (!/^[\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}]/u.test(segment)) {
+        term_chunks.push(segment.toLocaleLowerCase());
+      }
+
+      /**
+       * Notes:
+       * Regex to match the Rust \w class if we need it:
+       * /^[\p{Pc}\p{LC}\p{Ll}\p{Lm}\p{Lo}\p{Lt}\p{Lu}\p{Nd}\p{Nl}\p{No}\s]$/u
+       * ES2024 regex to match emoji if we need it:
+       * /\p{RGI_Emoji}/v
+       */
+    }
+
+    // TODO: We could use the "word" granularity for Intl.Segmenter to handle
+    // segmentation for non-whitespace-delimited languages.
+
+    term = term_chunks
+      .join("")
       .replace(/\s{2,}/g, " ")
       .trim();
+
     log(`Normalized search term to ${term}`);
 
     if (!term?.length && !filter_only) {
diff --git a/pagefind_web_js/types/internal.d.ts b/pagefind_web_js/types/internal.d.ts
index 14fed99b..2521f1d3 100644
--- a/pagefind_web_js/types/internal.d.ts
+++ b/pagefind_web_js/types/internal.d.ts
@@ -3,6 +3,7 @@ import "pagefindWeb";
 export type PagefindEntryJson = {
   version: string;
   languages: Record<string, PagefindEntryLanguage>;
+  include_characters: string[];
 };
 
 export type PagefindEntryLanguage = {

From 7daffc37652a1872597135308d6c456ed17b7bc6 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:58:25 +1300
Subject: [PATCH 2/5] Docs

---
 docs/content/docs/config-options.md | 23 ++++++++++++++++++++---
 docs/content/docs/indexing.md       | 22 +++++++++++++++++++++-
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/docs/content/docs/config-options.md b/docs/content/docs/config-options.md
index 209e9968..57b1c6a0 100644
--- a/docs/content/docs/config-options.md
+++ b/docs/content/docs/config-options.md
@@ -71,6 +71,23 @@ Note that currently Pagefind only supports lists of options via configuration fi
 |---------------------------|------------------------------|---------------------|
 | `--exclude-selectors <S>` | `PAGEFIND_EXCLUDE_SELECTORS` | `exclude_selectors` |
 
+### Include characters
+Prevents Pagefind from stripping the provided characters when indexing content.
+Allows users to search for words including these characters.
+
+See [Indexing special characters](/docs/indexing/#indexing-special-characters) for more documentation.
+
+Care is needed if setting this argument via the CLI, as special characters may be interpreted by your shell.
+Configure this via a [configuration file](/docs/config-sources/#config-files) if you encounter issues.
+
+```yml
+include_characters: "<>$"
+```
+
+| CLI Flag                   | ENV Variable                  | Config Key           |
+|----------------------------|-------------------------------|----------------------|
+| `--include-characters <S>` | `PAGEFIND_INCLUDE_CHARACTERS` | `include_characters` |
+
 ### Glob
 Configures the glob used by Pagefind to discover HTML files. Defaults to `**/*.{html}`.
 See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) for more details.
@@ -79,7 +96,7 @@ See [Wax patterns documentation](https://github.com/olson-sean-k/wax#patterns) f
 |-----------------|-----------------|------------|
 | `--glob <GLOB>` | `PAGEFIND_GLOB` | `glob`     |
 
-### Force Language
+### Force language
 Ignores any detected languages and creates a single index for the entire site as the provided language. Expects an ISO 639-1 code, such as `en` or `pt`.
 
 See [Multilingual search](/docs/multilingual/) for more details.
@@ -88,14 +105,14 @@ See [Multilingual search](/docs/multilingual/) for more details.
 |---------------------------|---------------------------|------------------|
 | `--force-language <LANG>` | `PAGEFIND_FORCE_LANGUAGE` | `force_language` |
 
-### Keep Index URL
+### Keep index URL
 Keeps `index.html` at the end of search result paths. By default, a file at `animals/cat/index.html` will be given the URL `/animals/cat/`. Setting this option to `true` will result in the URL `/animals/cat/index.html`.
 
 | CLI Flag           | ENV Variable     | Config Key       |
 |--------------------|------------------|------------------|
 | `--keep-index-url` | `KEEP_INDEX_URL` | `keep_index_url` |
 
-### Write Playground
+### Write playground
 Writes the Pagefind playground files to `/playground` within your bundle directory. For most sites, this will make the Pagefind playground available at `/pagefind/playground/`.
 
 This defaults to false, so playground files are not written to your live site. Playground files are always available when running Pagefind with `--serve`.
diff --git a/docs/content/docs/indexing.md b/docs/content/docs/indexing.md
index 08a3ed3c..44687028 100644
--- a/docs/content/docs/indexing.md
+++ b/docs/content/docs/indexing.md
@@ -92,5 +92,25 @@ Attributes of HTML elements can be added to the search index with the `data-page
 ```
 {{< /diffcode >}}
 
-This attribute takes a comma-separated list of other attributes to include inline with the indexed content.  
+This attribute takes a comma-separated list of other attributes to include inline with the indexed content.
 The above example will be indexed as: `Condimentum Nullam. Image Title. Image Alt. Nullam id dolor id nibh ultricies.`
+
+## Indexing special characters
+
+By default, Pagefind strips most punctuation out of the page when indexing content. Punctuation is also removed from the search term when searching.
+
+For some sites, such as documentation for programming languages, searching for punctuation can be important. In these cases,
+the default behavior can be changed using the [Include Characters](/docs/config-options/#include-characters) option when running Pagefind.
+
+For example, given the following HTML:
+
+```html
+<p>The &lt;head&gt; tag</p>
+```
+
+Pagefind's default indexing would index `the`, `head`, and `tag`,
+and a user typing in a search term of `<head>` will have their search adapted to `head`.
+While this will still match the correct page, it won't distinguish between this result and a result talking about the head of a git repository.
+
+With the [Include Characters](/docs/config-options/#include-characters) option set to `<>`, Pagefind will instead index `the`, `<head>`, `head`, and `tag`.
+A search for `head` will still locate this page, while a search for `<head>` won't be rewritten and will specifically match this page.

From c692a639f6133e1449b5fe6fb3f8d51bebf0015f Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:59:02 +1300
Subject: [PATCH 3/5] Note in changelog to expand later

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11c88e1a..88a2a231 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
 
 ## Unreleased
 
+* Added the "Include Characters" option
 * Added the Pagefind Playground
 * Reduced filesizes for the Pagefind WebAssembly
 

From 3bdb9bd6b753558eeda2dabdedcd7a91d9a0d728 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 22:09:16 +1300
Subject: [PATCH 4/5] Fix missing argument

---
 pagefind/src/fossick/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 90b0f430..7b71751f 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -999,7 +999,7 @@ mod tests {
         )
         .await;
 
-        let (content, words, _, _) = f.parse_digest();
+        let (content, words, _, _) = f.parse_digest(&test_opts());
 
         let mut words = words.keys().collect::<Vec<_>>();
         words.sort();

From b310e17f851f7a1d0f461d9f7e454ff1632b8baa Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 27 Jan 2025 22:36:55 +1300
Subject: [PATCH 5/5] Resolve regressions around character-only words, and
 standalone emoji

---
 pagefind/src/fossick/mod.rs | 48 +++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 7b71751f..c33597f4 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -338,12 +338,12 @@ impl Fossicker {
 
             // We use zero-width spaces as boundary values for some languages,
             // so we make sure that all are removed from the source content before going into the index.
-            let normalized_word = word.replace('\u{200B}', "");
-            if normalized_word.is_empty() {
+            let base_word = word.replace('\u{200B}', "");
+            if base_word.is_empty() {
                 return;
             }
 
-            content.push_str(&word.replace('\u{200B}', ""));
+            content.push_str(&base_word);
             if append_whitespace {
                 content.push(' ');
             }
@@ -351,10 +351,10 @@ impl Fossicker {
             if should_segment {
                 content.push('\u{200B}');
             }
-            let mut normalized_word = String::with_capacity(word.len());
+            let mut normalized_word = String::with_capacity(base_word.len());
             let mut possibly_compound = false;
 
-            for mut c in word.chars() {
+            for mut c in base_word.chars() {
                 let is_alpha = c.is_alphanumeric();
                 if !is_alpha {
                     possibly_compound = true;
@@ -378,25 +378,31 @@ impl Fossicker {
             // For words that may be CompoundWords, also index them as their constituent parts
             if possibly_compound {
                 let (word_parts, extras) = get_discrete_words(word);
-                // Only proceed if the word was broken into multiple parts
-                if word_parts.contains(|c: char| c.is_whitespace())
-                    || (!normalized_word.starts_with(&word_parts))
-                {
-                    let part_words: Vec<_> = word_parts.split_whitespace().collect();
-
-                    if !part_words.is_empty() {
-                        // Index constituents of a compound word as a proportion of the
-                        // weight of the full word.
-                        let per_weight = (word_weight
-                            / part_words.len().try_into().unwrap_or(std::u8::MAX))
-                        .max(1);
-
-                        // Only index two+ character words
-                        for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
-                            store_word(part_word, total_word_index, per_weight);
+
+                // If this word normalized to nothing, we don't want to insert it here.
+                // (Though we do want to process the extras below, for things like emoji).
+                if !normalized_word.is_empty() {
+                    // Only proceed if the word was broken into multiple parts
+                    if word_parts.contains(|c: char| c.is_whitespace())
+                        || (!normalized_word.starts_with(&word_parts))
+                    {
+                        let part_words: Vec<_> = word_parts.split_whitespace().collect();
+
+                        if !part_words.is_empty() {
+                            // Index constituents of a compound word as a proportion of the
+                            // weight of the full word.
+                            let per_weight = (word_weight
+                                / part_words.len().try_into().unwrap_or(std::u8::MAX))
+                            .max(1);
+
+                            // Only index two+ character words
+                            for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
+                                store_word(part_word, total_word_index, per_weight);
+                            }
                         }
                     }
                 }
+
                 // Additionally store any special extra characters we are given
                 if let Some(extras) = extras {
                     for extra in extras {