From 8b80d254054b7c41a6a240e92e91de8f30692a7a Mon Sep 17 00:00:00 2001
From: Michael Davis
Date: Tue, 12 Nov 2024 21:27:58 -0500
Subject: [PATCH] suggester: Switch from UTF-32 to UTF-8 in ngram module

Nuspell switches into UTF-32 for the ngram part of the suggester. This makes
plenty of the metrics easier to calculate since, for example, `s[3]` is the
third character in a UTF-32 string. (Not so with UTF-8: indexing into a
UTF-8 string is a byte index and not necessarily a character boundary.)

UTF-32 in Rust is not very well supported compared to UTF-8. Instead of a
`String` you use a `Vec<char>` and instead of `&str` you have `&[char]`.
These alternatives do not have as well optimized routines as UTF-8,
especially when it comes to the standard library's unstable `Pattern` trait.
The standard library will use platform `memcmp` and the `memchr` crate for
operations like `[u8]::eq` and `[u8]::contains` respectively, which far
outperform generic/dumb `[T]::eq` or `[T]::starts_with`.

The most expensive part of ngram analysis seems to be the first step:
iterating over the whole word list and comparing the input word with the
basic `ngram_similarity` function. A flamegraph reveals that we spent a huge
amount of time in `contains_subslice`, a generic but dumb routine called by
`ngram_similarity` in a loop. It's a function that finds whether a slice
contains a subslice and emulates Nuspell's use of
`std::basic_string_view::find`. This was ultimately a lot of
`[char]::starts_with`, which is fairly slow relative to `str::contains` with
a `str` pattern.

The `ngram` module was a bit special before this commit because it eagerly
switched most `str`s into UTF-32: `Vec<char>`s. That matched Nuspell and, as
mentioned above, made some calculations easier/dumber. But the optimizations
in the standard library for UTF-8 are undeniable. This commit decreases the
total time to `suggest` for a rough word like "exmaple" by 25%.

"exmaple" is tough because it contains two 'e's. 'E' is super common in
English, so the `ngram_similarity` function ends up working relatively
harder for "exmaple". The same goes for other words with multiple common
chars or common stem substrings. The reason is that `ngram_similarity` has a
fast lane to break out of looping when it notices that a word is quite
dissimilar. It's a kind of "layered cake": for a `left` and `right` string,
you first find any k=1 kgrams of `left` in `right`, which is a fancy way of
saying you find any `char`s of `left` that are in `right`. If there is more
than one match you move on to k=2: find any substrings of `right` that match
any two-character window in `left`. So the substrings you search for
increase in size:

    k=1: "e", "x", "m", "a", "p", "l", "e"
    k=2: "ex", "xm", "ma", "ap", "pl", "le"
    k=3: "exm", "xma", "map", "apl", "ple"
    ...

You may break out of the loop at a low `k` if your words are dissimilar.
Words with multiple common letters, though, are unlikely to break out for
your average other stem in the dictionary. (A sketch of this layering is
included below.) All of this is to say that checking whether `right`
contains a given subslice of `left` is central to this module, and even more
so in degenerate cases. So I believe focusing on UTF-8 here is worth the
extra complexity of dealing with byte indices.

---

To make this possible this module adds a wrapper struct around `&str`,
`CharsStr`:

```rust
struct CharsStr<'a, 'i> {
    inner: &'a str,
    char_indices: &'i [u16],
}
```

This eagerly computes the `str::char_indices` in `CharsStr::new`, borrowing
a `&'i mut Vec<u16>`'s allocation.
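To make the idea concrete, here is a minimal, self-contained sketch of
`CharsStr` (a simplification of the real module below: no `unsafe`, and only
the two accessors this explanation needs):

```rust
// Sketch only: the real `CharsStr` in src/suggester/ngram.rs also offers
// `char_at`, `char_iter` and `is_empty`, and skips the slicing bounds checks.
struct CharsStr<'a, 'i> {
    inner: &'a str,
    char_indices: &'i [u16],
}

impl<'a, 'i> CharsStr<'a, 'i> {
    fn new(s: &'a str, slab: &'i mut Vec<u16>) -> Self {
        // Byte length bounds the char count, so `as u16` is safe after this.
        assert!(s.len() <= u16::MAX as usize);
        slab.clear();
        // One byte index per char...
        slab.extend(s.char_indices().map(|(idx, _ch)| idx as u16));
        // ...plus the total length, so `char_indices[i]..char_indices[i + 1]`
        // is a valid byte range for every char index `i`.
        slab.push(s.len() as u16);
        CharsStr { inner: s, char_indices: slab.as_slice() }
    }

    fn len_chars(&self) -> usize {
        self.char_indices.len() - 1
    }

    // Subslice by *char* range. The result is still a `&str`, so the standard
    // library's optimized routines (`str::contains` etc.) apply to it.
    fn char_slice(&self, range: core::ops::Range<usize>) -> &str {
        let start = self.char_indices[range.start] as usize;
        let end = self.char_indices[range.end] as usize;
        &self.inner[start..end]
    }
}

fn main() {
    // One slab reused across many words, the way `ngram_suggest` reuses its
    // buffers across the word list.
    let mut slab = Vec::new();
    for word in ["exmaple", "crème"] {
        let chars = CharsStr::new(word, &mut slab);
        // Char-indexed slicing stays correct for multi-byte chars like 'è'.
        println!("{word}: {} chars, chars 1..3 = {:?}",
                 chars.len_chars(), chars.char_slice(1..3));
    }
}
```

The point is that `char_slice` takes character indices but hands back plain
UTF-8, so everything downstream keeps the standard library's fast string
searching.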
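And here is the hedged sketch of the k-gram "layer cake" promised above: a
simplified model of the similarity loop and its early exit, not the exact
code in this patch (the real loop slices a reused `CharsStr` instead of
allocating a `String` per kgram):

```rust
// Simplified model of ngram similarity: count the k-grams of `left` found
// in `right`, growing `k` only while the previous layer matched enough.
fn ngram_similarity_sketch(max_k: usize, left: &str, right: &str) -> isize {
    let left: Vec<char> = left.chars().collect();
    let mut score = 0;
    for k in 1..=max_k.min(left.len()) {
        let mut k_score = 0;
        for kgram in left.windows(k) {
            // This sketch allocates per kgram; the patch avoids exactly this
            // by slicing reused UTF-8 via `CharsStr::char_slice`.
            let kgram: String = kgram.iter().collect();
            if right.contains(&kgram) {
                k_score += 1;
            }
        }
        score += k_score;
        // The fast lane: dissimilar words break out at a low `k`.
        if k_score < 2 {
            break;
        }
    }
    score
}

fn main() {
    // Similar words climb all three layers (7 + 3 + 1 matches here)...
    assert_eq!(ngram_similarity_sketch(3, "exmaple", "example"), 11);
    // ...while dissimilar words bail out right after the k=1 layer.
    assert_eq!(ngram_similarity_sketch(3, "exmaple", "dog"), 0);
}
```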
Eagerly computing the char indices this way is hopefully about as expensive
as converting each stem or expanded string to UTF-32 in a reused
`Vec<char>`: we need to iterate over the chars either way, and we allocate
one element per char. Unlike converting to UTF-32, though, we retain (and do
not duplicate) the UTF-8 representation, which lets us take advantage of the
standard library's string searching optimizations. Hopefully this is also
thriftier with memory.
---
 README.md              |   1 +
 src/aff.rs             |  15 --
 src/suggester/ngram.rs | 499 +++++++++++++++++++++++++----------------
 3 files changed, 310 insertions(+), 205 deletions(-)

diff --git a/README.md b/README.md
index fe067ab..5f85f02 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,7 @@ COMPOUNDRULE n*mp
 * [`@zverok`]'s [blog series on rebuilding Hunspell][zverok-blog] was an invaluable resource during early prototypes. The old [`spylls`](https://github.com/zverok/spylls)-like prototype can be found on the `spylls` branch.
 * Ultimately [Nuspell](https://github.com/nuspell/nuspell)'s codebase became the reference for Spellbook though as C++ idioms mesh better with Rust than Python's. Nuspell's code is in great shape and is much more readable than Hunspell so for now Spellbook is essentially a Rust rewrite of Nuspell (though we may diverge in the future).
+  * There are a few ways Spellbook diverges from Nuspell. Mostly this relates to data structures like using [`hashbrown`] instead of a custom hash table implementation or German strings for stems and flagsets (see the internal doc). Another difference is that Spellbook uses UTF-8 when calculating ngram suggestions rather than UTF-32; the results are the same but this performs better given the Rust standard library's optimizations for UTF-8.
 * The parser for `.dic` and `.aff` files is loosely based on [ZSpell](https://github.com/pluots/zspell).
 
 [`hashbrown`]: https://github.com/rust-lang/hashbrown
diff --git a/src/aff.rs b/src/aff.rs
index 96bfcf7..ffc175d 100644
--- a/src/aff.rs
+++ b/src/aff.rs
@@ -1115,21 +1115,6 @@ impl CaseHandling {
         }
     }
 
-    pub fn lowercase_into_utf32(&self, word: &str, out: &mut Vec<char>) {
-        out.extend(
-            word.chars()
-                .map(match self {
-                    Self::Turkic => |ch| match ch {
-                        'I' => 'ı',
-                        'İ' => 'i',
-                        _ => ch,
-                    },
-                    Self::Standard => |ch| ch,
-                })
-                .flat_map(|ch| ch.to_lowercase()),
-        )
-    }
-
     pub fn uppercase(&self, word: &str) -> String {
         match self {
             Self::Turkic => word.replace('i', "İ").replace('ı', "I").to_uppercase(),
diff --git a/src/suggester/ngram.rs b/src/suggester/ngram.rs
index add7271..bd46d3a 100644
--- a/src/suggester/ngram.rs
+++ b/src/suggester/ngram.rs
@@ -20,35 +20,24 @@
 // suggestions are implemented for compatibility with Nuspell/Hunspell but we could consider
 // adding other strategies as well, for example looking at the Aspell code.
 
+// # Implementation note
+//
+// There's a theme in this module of reusing `Vec` allocations rather than having functions
+// return new `Vec`s. This is slightly awkward but is necessary for performance. The
+// `ngram_suggest` function in this module has a very hot loop iterating over every stem in the
+// word list, so individual allocations add up and the performance boost from reusing allocations
+// becomes noticeable.
+
 use core::hash::BuildHasher;
+use core::slice;
 
 use crate::alloc::{collections::BinaryHeap, string::String, vec::Vec};
 use crate::aff::{CaseHandling, HIDDEN_HOMONYM_FLAG, MAX_SUGGESTIONS};
-use crate::{FlagSet, Stem, FULL_WORD};
+use crate::{FlagSet, FULL_WORD};
 
 use super::Suggester;
 
-// For ngram suggestions we'll be switching to UTF-32. UTF-32 uses 32-bit integers to represent
-// every char.
-//
-// Compare to UTF-8 - the representation of `String` and `str` - in which a character could be
-// represented by one through four bytes.
-//
-// UTF-32 is handy because indices are intuitive. `utf32_str[3]` is the third character. The same
-// isn't true of UTF-8 - you index into the bytes so `utf8_str[3]` could be right in the middle of
-// the bytes used to represent a character.
-//
-// `String` is a newtype wrapper around a `Vec<u8>` which is asserted to contain valid UTF-8. In
-// contrast we won't use a newtype wrapper for our Utf32String below since any sequence of
-// `char`s is valid UTF-32.
-//
-// This is getting off-topic but UTF-8 is generally preferred because it has good compression -
-// for ASCII (which is very common) you only need a byte to represent a character. UTF-32 strings
-// take a constant 4 bytes per character which is relatively expensive.
-type Utf32String = Vec<char>;
-type Utf32Str = [char];
-
 macro_rules! has_flag {
     ( $flags:expr, $flag:expr ) => {{
         match $flag {
@@ -80,14 +69,16 @@ impl PartialOrd for MinScored {
 }
 
 impl<'a, S: BuildHasher> Suggester<'a, S> {
-    pub(super) fn ngram_suggest(&self, word: &str, out: &mut Vec<String>) {
+    pub(super) fn ngram_suggest(&self, word_str: &str, out: &mut Vec<String>) {
         // First step: find 100 stems in the word list with the best ngram score.
-        let wrong_word: Utf32String = word.chars().collect();
-        let mut wide_buf = Vec::new();
+        let mut word_buf = Vec::with_capacity(word_str.len());
+        let word = CharsStr::new(word_str, &mut word_buf);
+        // Overallocate so we probably don't need to reallocate in the loop:
+        let mut stem_buf = Vec::with_capacity(word.len_chars() * 2);
+        let mut lowercase_stem_buf = Vec::with_capacity(stem_buf.len());
         let mut roots = BinaryHeap::with_capacity(100);
-        let mut stem_utf32 = Vec::new();
-        for (stem_utf8, flagset) in self.checker.words.iter() {
+        for entry @ (stem, flagset) in self.checker.words.iter() {
             if has_flag!(flagset, self.checker.aff.options.forbidden_word_flag)
                 || has_flag!(flagset, self.checker.aff.options.no_suggest_flag)
                 || has_flag!(flagset, self.checker.aff.options.only_in_compound_flag)
@@ -95,26 +86,25 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
             {
                 continue;
             }
-            // Convert the dictionary stem to UTF-32.
-            stem_utf32.clear();
-            stem_utf32.extend(stem_utf8.as_ref().chars());
-            let mut score = left_common_substring_length(
-                &self.checker.aff.options.case_handling,
-                &wrong_word,
-                &stem_utf32,
-            ) as isize;
-
-            wide_buf.clear();
-            self.checker
+            let stem = CharsStr::new(stem.as_str(), &mut stem_buf);
+            let mut score =
+                left_common_substring_length(&self.checker.aff.options.case_handling, word, stem)
+                    as isize;
+
+            // TODO: lowercase into buf so we can reuse this allocation? It would mean copying a
+            // lot of code from the standard library unfortunately.
+            let lowercase_stem = self
+                .checker
                 .aff
                 .options
                 .case_handling
-                .lowercase_into_utf32(stem_utf8.as_ref(), &mut wide_buf);
-            score += ngram_similarity_longer_worse(3, &wrong_word, &wide_buf);
+                .lowercase(stem.as_str());
+            let lowercase_stem = CharsStr::new(lowercase_stem.as_str(), &mut lowercase_stem_buf);
+            score += ngram_similarity_longer_worse(3, word, lowercase_stem);
 
             let root = MinScored {
                 score,
-                inner: (stem_utf8, flagset),
+                inner: entry,
             };
             if roots.len() != 100 {
                 roots.push(root);
@@ -128,17 +118,20 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
 
         // Calculate a somewhat low threshold score so that we can ignore bad suggestions in the
         // next steps.
+ let mut mangled_word = String::new(); let mut threshold = 0isize; - for k in 1..=3 { - let mangled_word = &mut wide_buf; + for k_byte_idx in word.char_indices().skip(1).take(3) { + let k_byte_idx = *k_byte_idx as usize; mangled_word.clear(); - mangled_word.extend_from_slice(&wrong_word); - let mut i = k; - while i < mangled_word.len() { - mangled_word[i] = '*'; - i += 4; - } - threshold += ngram_similarity_any_mismatch(wrong_word.len(), &wrong_word, mangled_word); + mangled_word.push_str(&word_str[..k_byte_idx]); + mangled_word.extend(word_str[k_byte_idx..].chars().enumerate().map(|(i, ch)| { + if i % 4 == 0 { + '*' + } else { + ch + } + })); + threshold += ngram_similarity_any_mismatch(word.len_chars(), word, &mangled_word); } threshold /= 3; @@ -148,7 +141,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> { let mut expanded_list = Vec::new(); let mut expanded_cross_affix = Vec::new(); - let mut expanded_word_utf32 = Vec::new(); + let mut expanded_word_buf = Vec::with_capacity(word.len_chars() * 2); let mut guess_words = BinaryHeap::new(); for MinScored { @@ -158,45 +151,35 @@ impl<'a, S: BuildHasher> Suggester<'a, S> { { expanded_cross_affix.clear(); self.expand_stem_for_ngram( - stem, + stem.as_str(), flags, - word, + word_str, &mut expanded_list, &mut expanded_cross_affix, ); - for expanded_word_utf8 in expanded_list.drain(..) { - expanded_word_utf32.clear(); - expanded_word_utf32.extend(expanded_word_utf8.chars()); - + for expanded_word in expanded_list.drain(..) { let mut score = left_common_substring_length( &self.checker.aff.options.case_handling, - &wrong_word, - &expanded_word_utf32, + word, + CharsStr::new(&expanded_word, &mut expanded_word_buf), ) as isize; - let lower_expanded_word = &mut wide_buf; - lower_expanded_word.clear(); - self.checker + let lower_expanded_word = self + .checker .aff .options .case_handling - .lowercase_into_utf32(&expanded_word_utf8, lower_expanded_word); - score += ngram_similarity_any_mismatch( - wrong_word.len(), - &wrong_word, - lower_expanded_word, - ); + .lowercase(&expanded_word); + score += + ngram_similarity_any_mismatch(word.len_chars(), word, &lower_expanded_word); if score < threshold { continue; } - // Nuspell stores the UTF-32 word in `guess_words`, but then later converts from - // UTF-32 into UTF-8 when pushing into `out`. For our sake it's easier to store - // the UTF-8 instead and avoid the conversion later. let guess_word = MinScored { score, - inner: expanded_word_utf8, + inner: expanded_word, }; if guess_words.len() != 200 { guess_words.push(guess_word); @@ -216,43 +199,41 @@ impl<'a, S: BuildHasher> Suggester<'a, S> { // Nuspell questions whether or not the heap needs to be sorted before iterating. // For now, they sort the heap. I think Nuspell is correct to do so because the `break` // below could cause a different end behavior based on whether we're iterating on a sorted - // or unsorted vec. + // or unsorted vec. Note that we are sorting in descending order here. In my manual + // testing, using `BinaryHeap::into_vec` instead produces no noticeable difference. let mut guess_words = guess_words.into_sorted_vec(); + let mut lower_guess_word_buf = Vec::with_capacity(word.len_chars()); + // `iter_mut` because this loop modifies the `score`. 
for MinScored { score, inner: guess_word, } in guess_words.iter_mut() { - let lower_guess_word = &mut wide_buf; - lower_guess_word.clear(); - self.checker - .aff - .options - .case_handling - .lowercase_into_utf32(guess_word, lower_guess_word); + let lower_guess_word = self.checker.aff.options.case_handling.lowercase(guess_word); + let lower_guess_word = CharsStr::new(&lower_guess_word, &mut lower_guess_word_buf); - let lcs = - longest_common_subsequence_length(&wrong_word, lower_guess_word, &mut lcs_state); + let lcs = longest_common_subsequence_length(word, lower_guess_word, &mut lcs_state); - if wrong_word.len() == lower_guess_word.len() && wrong_word.len() == lcs { + if word.len_chars() == lower_guess_word.len_chars() && word.len_chars() == lcs { *score += 2000; break; } - let mut ngram2 = - ngram_similarity_any_mismatch_weighted(2, &wrong_word, lower_guess_word); - ngram2 += ngram_similarity_any_mismatch_weighted(2, lower_guess_word, &wrong_word); - let ngram4 = ngram_similarity_any_mismatch(4, &wrong_word, lower_guess_word); + let mut ngram2 = ngram_similarity_any_mismatch_weighted(2, word, lower_guess_word); + ngram2 += ngram_similarity_any_mismatch_weighted(2, lower_guess_word, word); + let ngram4 = ngram_similarity_any_mismatch(4, word, lower_guess_word.as_str()); + let left_common = left_common_substring_length( &self.checker.aff.options.case_handling, - &wrong_word, + word, lower_guess_word, ); + let (num_eq_chars_same_pos, eq_char_is_swapped) = - count_eq_at_same_pos(&wrong_word, lower_guess_word); + count_eq_at_same_pos(word, lower_guess_word); *score = 2 * lcs as isize; - *score -= (wrong_word.len() as isize - lower_guess_word.len() as isize).abs(); + *score -= (word.len_chars() as isize - lower_guess_word.len_chars() as isize).abs(); *score += left_common as isize + ngram2 + ngram4; if num_eq_chars_same_pos != 0 { *score += 1; @@ -262,7 +243,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> { } if 5 * ngram2 - < ((wrong_word.len() + lower_guess_word.len()) + < ((word.len_chars() + lower_guess_word.len_chars()) * (10 - self.checker.aff.options.max_diff_factor as usize)) as isize { @@ -298,15 +279,8 @@ impl<'a, S: BuildHasher> Suggester<'a, S> { { break; } - // Nuspell is converting back to UTF-8 but we store the `guess_word` in UTF-8 form. - // I think Nuspell only carries UTF-32 because of templates allowing easy conversion - // to lowercase. In Spellbook's case we need an explicit function for this and we - // already have one for UTF-8, so it's easier to carry UTF-8. It's nearly always less - // memory plus we save a conversion to UTF-8 right here: - if out - .iter() - .any(|sug| contains_subslice(guess_word.as_bytes(), sug.as_bytes())) - { + // Nuspell converts back to UTF-8 here but we've been working with UTF-8 all along. 
+            if out.iter().any(|sug| guess_word.contains(sug)) {
                 if score < -100 {
                     break;
                 } else {
@@ -319,7 +293,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
 
     fn expand_stem_for_ngram(
         &self,
-        stem: &Stem,
+        stem: &str,
         flags: &FlagSet,
        word: &str,
         expanded_list: &mut Vec<String>,
@@ -329,7 +303,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
         cross_affix.clear();
 
         if !has_flag!(flags, self.checker.aff.options.need_affix_flag) {
-            expanded_list.push(String::from(stem.as_ref()));
+            expanded_list.push(String::from(stem));
             cross_affix.push(false);
         }
 
@@ -359,18 +333,18 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
                 if suffix
                     .strip
                     .as_ref()
-                    .is_some_and(|suf| !stem.as_str().ends_with(&**suf))
+                    .is_some_and(|suf| !stem.ends_with(&**suf))
                 {
                     continue;
                 }
-                if !suffix.condition_matches(stem.as_str()) {
+                if !suffix.condition_matches(stem) {
                     continue;
                 }
                 if !suffix.add.is_empty() && !word.ends_with(&*suffix.add) {
                     continue;
                 }
 
-                let expanded = suffix.to_derived(stem.as_str());
+                let expanded = suffix.to_derived(stem);
                 expanded_list.push(expanded);
                 cross_affix.push(suffix.crossproduct);
             }
@@ -409,7 +383,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
                         continue;
                     }
 
-                    let expanded = prefix.to_derived(stem.as_str());
+                    let expanded = prefix.to_derived(stem);
                     expanded_list.push(expanded);
                 }
             }
@@ -430,56 +404,201 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
             if prefix
                 .strip
                 .as_ref()
-                .is_some_and(|pre| !stem.as_str().starts_with(&**pre))
+                .is_some_and(|pre| !stem.starts_with(&**pre))
             {
                 continue;
             }
-            if !prefix.condition_matches(stem.as_str()) {
+            if !prefix.condition_matches(stem) {
                 continue;
             }
             if !prefix.add.is_empty() && !word.starts_with(&*prefix.add) {
                 continue;
             }
 
-            let expanded = prefix.to_derived(stem.as_str());
+            let expanded = prefix.to_derived(stem);
             expanded_list.push(expanded);
         }
     }
 }
 
+/// A borrowed string (`&str`) wrapper which eagerly computes `core::str::char_indices`.
+///
+/// With this type it is cheap both to ask for the number of Unicode characters in the string
+/// (`len_chars`) and to subslice the string by _character index_.
+///
+/// Compare with a regular `&str`: the number of characters can be counted with
+/// `string.chars().count()` - a linear operation w.r.t. the length of the string. Subslicing
+/// by character indices can't be done directly on a `&str`.
+///
+/// # Some discussion on UTF-8 vs. UTF-32
+///
+/// Nuspell uses UTF-32 for the ngram similarity section of the suggester because you can iterate
+/// and index on characters easily. (In UTF-32 every index is a character because in Unicode you
+/// need at most 32 bits to represent any character. Note that a character is not the same as a
+/// grapheme cluster like 🏴‍☠️.) The C++ standard library seems to be better optimized for UTF-32
+/// operations (I believe `std::u32string` and the equivalent string views), specifically
+/// `std::basic_string_view`'s `find`, which is central to `ngram_similarity`. We now use
+/// `str::contains(kgram)` for the same purpose and it performs much better than
+/// `[char]::starts_with` plus indexing.
+///
+/// Rust's UTF-8 methods are well optimized, specifically by taking advantage of `memchr` and
+/// SIMD to search `str`s. We retain a performance advantage by staying in UTF-8 and avoiding the
+/// more generically (read: dumbly) implemented `Eq for [T]` or `[T]::starts_with`.
+///
+/// In Spellbook's history this module was first implemented to work with UTF-32. Switching to
+/// UTF-8 yielded an impressive 25% speed up for suggesting wrong words like "exmaple" which end
+/// up in the `ngram_similarity` loop.
+///
+/// <details>
+/// <summary>What's curious about "exmaple" specifically?</summary>
+///
+/// "ngram similarity" is a kind of 'layer cake' when it comes to performance. The more similar
+/// the input strings, the more you have to compare. First you start at `k=1` and check how many
+/// times each character in `left` appears in `right`. If that happens more than once for any
+/// character you check how many times any two-character (`k=2`) combo in `left` appears in
+/// `right`. If that happens more than once then you move on to layer 3 (`k=3`).
+///
+/// This means that when `left` contains two or more of a common letter for the language - like
+/// "e" is in English - then the `ngram_similarity` function does much more work.
+///
+/// `ngram_similarity` relies on finding sub-`&str`s in `right`, so the faster that you can
+/// determine whether a kgram of `left` appears in `right`, the faster `ngram_similarity` will be.
+///
+/// </details>
+///
+/// The original implementation eagerly collected `&str`s into `&[char]`, paying the cost of
+/// converting to UTF-32 once. This type is similar - we only run a character iterator over
+/// the string once. But we can take advantage of optimized UTF-8 comparison tools in the
+/// standard library.
+///
+/// # Lifetimes
+///
+/// For performance reasons this struct borrows a slab of memory from a `Vec<u16>` which is
+/// hopefully instantiated many fewer times than this struct. See the `CharsStr::new` docs.
+#[derive(Clone, Copy)]
+struct CharsStr<'s, 'i> {
+    inner: &'s str,
+    char_indices: &'i [u16],
+}
+
+impl<'s, 'i> CharsStr<'s, 'i> {
+    /// Creates a `CharsStr`, borrowing the input `&str` and `slab`'s allocation.
+    ///
+    /// Taking in `slab` here is weird - it's not a nice API. We do it because otherwise we'd
+    /// need to allocate a new, short-lived `Vec<u16>` for every stem in the WordList.
+    /// Short-lived allocations are bad for performance - `alloc`/`dealloc` from the Rust
+    /// standard library (and their implementations way down to the kernel) are usually slow
+    /// compared to stack-based operations. It's usually better to reuse a (probably
+    /// overallocated) "block" / "slab" of memory, and, intuitively, the savings get better the
+    /// more you reuse.
+    ///
+    /// In `ngram_suggest` above, for example, the vector used as `slab` is reused across
+    /// iterations over the stems in the word list. In fact we allocate a vector just for the
+    /// stems' `CharsStr`s: instead of allocating per iteration of that loop we allocate once
+    /// and probably reallocate very rarely.
+    fn new(s: &'s str, slab: &'i mut Vec<u16>) -> Self {
+        let len_bytes = s.len();
+        // Note: the number of bytes is greater than or equal to the number of chars, so all
+        // `as u16` conversions are safe after this assertion. (Equal when the string is ASCII
+        // only.)
+        assert!(len_bytes <= u16::MAX as usize);
+        slab.clear();
+        slab.extend(s.char_indices().map(|(i, _ch)| i as u16));
+        // Push the length so that we can make exclusive ranges out of the windows of the
+        // `char_indices` slice.
+        slab.push(len_bytes as u16);
+
+        Self {
+            inner: s,
+            char_indices: slab.as_slice(),
+        }
+    }
+
+    const fn len_chars(&self) -> usize {
+        // We push an extra element for the total `self.inner.len()`.
+        self.char_indices.len() - 1
+    }
+
+    const fn is_empty(&self) -> bool {
+        // As above, we pushed the extra element. So when `self.inner` is empty,
+        // `self.char_indices` should be `&[0u16]`.
+        self.char_indices.len() == 1
+    }
+
+    const fn as_str(&self) -> &str {
+        self.inner
+    }
+
+    /// Returns a `&str` subslice containing all of the characters in the given _character_
+    /// range.
+    ///
+    /// Note that this method takes character indices and not byte indices.
+    fn char_slice(&self, char_range: core::ops::Range<usize>) -> &str {
+        let start_byte = self.char_indices[char_range.start] as usize;
+        let end_byte = self.char_indices[char_range.end] as usize;
+        // SAFETY: the caller is in charge of providing char indices that are in bounds of the
+        // `self.char_indices` array. (Those accesses are bounds checked.) All byte indices in
+        // `self.char_indices` are valid.
+        // Unfortunately the bounds checks cost a noticeable amount on a flamegraph, so we prefer
+        // the unsafe version to `&self.inner[start_byte..end_byte]`.
+        unsafe { self.inner.get_unchecked(start_byte..end_byte) }
+    }
+
+    fn char_at(&self, char_idx: usize) -> &str {
+        let start_byte = self.char_indices[char_idx] as usize;
+        let end_byte = self.char_indices[char_idx + 1] as usize;
+        // SAFETY: same as above. All byte indices in `self.char_indices` are valid and the
+        // above accesses are bounds checked.
+        unsafe { self.inner.get_unchecked(start_byte..end_byte) }
+    }
+
+    fn char_iter(&self) -> impl Iterator<Item = &str> + '_ {
+        // SAFETY: as above, all byte indices in `self.char_indices` are valid. `slice::windows`
+        // always produces valid indices into the slice, so all of these accesses can be done
+        // unchecked safely.
+        self.char_indices.windows(2).map(|idxs| unsafe {
+            let start = *idxs.get_unchecked(0) as usize;
+            let end = *idxs.get_unchecked(1) as usize;
+            self.inner.get_unchecked(start..end)
+        })
+    }
+
+    fn char_indices(&self) -> slice::Iter<'_, u16> {
+        self.char_indices.iter()
+    }
+}
+
 fn left_common_substring_length(
     case_handling: &CaseHandling,
-    left: &Utf32Str,
-    right: &Utf32Str,
+    left: CharsStr,
+    right: CharsStr,
 ) -> usize {
-    if left.is_empty() || right.is_empty() {
+    let mut left_chars = left.as_str().chars();
+    let mut right_chars = right.as_str().chars();
+
+    let Some((l, r)) = left_chars.next().zip(right_chars.next()) else {
         return 0;
-    }
+    };
 
-    if left[0] != right[0] && !case_handling.is_char_eq_lowercase(left[0], right[0]) {
+    if l != r && !case_handling.is_char_eq_lowercase(l, r) {
         return 0;
     }
 
-    index_of_mismatch(&left[1..], &right[1..])
+    index_of_mismatch(left_chars, right_chars)
         .map(|idx| idx + 1)
-        .unwrap_or(left.len())
+        .unwrap_or(left.len_chars())
 }
 
-fn index_of_mismatch<T: Eq>(left: &[T], right: &[T]) -> Option<usize> {
-    left.iter()
-        .enumerate()
-        .find_map(|(idx, l)| match right.get(idx) {
-            Some(r) if r == l => None,
-            _ => Some(idx),
-        })
+fn index_of_mismatch<T: Eq, I: Iterator<Item = T>>(left: I, mut right: I) -> Option<usize> {
+    left.enumerate().find_map(|(idx, l)| match right.next() {
+        Some(r) if r == l => None,
+        _ => Some(idx),
+    })
 }
 
-fn ngram_similarity_longer_worse(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
+fn ngram_similarity_longer_worse(n: usize, left: CharsStr, right: CharsStr) -> isize {
     if right.is_empty() {
         return 0;
     }
-    let mut score = ngram_similarity(n, left, right);
-    let d = (left.len() as isize - right.len() as isize) - 2;
+    let mut score = ngram_similarity(n, left, right.as_str());
+    let d = (right.len_chars() as isize - left.len_chars() as isize) - 2;
     if d > 0 {
         score -= d;
     }
@@ -487,15 +606,15 @@ fn ngram_similarity_longer_worse(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
 }
 
 // Nuspell calls this `ngram_similarity_low_level`.
-fn ngram_similarity(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
-    let n = n.min(left.len());
+fn ngram_similarity(n: usize, left: CharsStr, right: &str) -> isize {
+    let n = n.min(left.len_chars());
     let mut score = 0;
     for k in 1..=n {
         let mut k_score = 0;
-        for i in 0..=left.len() - k {
-            let kgram = &left[i..i + k];
-            if contains_subslice(right, kgram) {
+        for i in 0..=left.len_chars() - k {
+            let kgram = left.char_slice(i..i + k);
+            if right.contains(kgram) {
                 k_score += 1;
             }
         }
@@ -508,27 +627,12 @@ fn ngram_similarity(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
     score
 }
 
-fn contains_subslice<T: Eq>(slice: &[T], subslice: &[T]) -> bool {
-    if subslice.len() > slice.len() {
-        return false;
-    }
-
-    let window = slice.len() - subslice.len();
-    for i in 0..=window {
-        if slice[i..].starts_with(subslice) {
-            return true;
-        }
-    }
-
-    false
-}
-
-fn ngram_similarity_any_mismatch(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
+fn ngram_similarity_any_mismatch(n: usize, left: CharsStr, right: &str) -> isize {
     if right.is_empty() {
         return 0;
     }
     let mut score = ngram_similarity(n, left, right);
-    let d = (right.len() as isize - left.len() as isize).abs() - 2;
+    let d = (right.chars().count() as isize - left.len_chars() as isize).abs() - 2;
     if d > 0 {
         score -= d;
     }
@@ -536,22 +640,21 @@ fn ngram_similarity_any_mismatch(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
 }
 
 // Nuspell returns an isize.
-fn longest_common_subsequence_length<T: Eq>(
-    left: &[T],
-    right: &[T],
+fn longest_common_subsequence_length(
+    left: CharsStr,
+    right: CharsStr,
     state_buffer: &mut Vec<usize>,
 ) -> usize {
     state_buffer.clear();
-    state_buffer.resize(right.len(), 0);
+    state_buffer.resize(right.len_chars(), 0);
 
     let mut row1_prev = 0;
-    for l in left.iter() {
+    for l in left.char_iter() {
         row1_prev = 0;
         let mut row2_prev = 0;
-        for j in 0..right.len() {
-            let row1_current = state_buffer[j];
-            let row2_current = &mut state_buffer[j];
-            *row2_current = if *l == right[j] {
+        for (j, row2_current) in state_buffer.iter_mut().enumerate().take(right.len_chars()) {
+            let row1_current = *row2_current;
+            *row2_current = if l == right.char_at(j) {
                 row1_prev + 1
             } else {
                 row1_current.max(row2_prev)
@@ -565,31 +668,31 @@ fn longest_common_subsequence_length<T: Eq>(
     row1_prev
 }
 
-fn ngram_similarity_any_mismatch_weighted(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
+fn ngram_similarity_any_mismatch_weighted(n: usize, left: CharsStr, right: CharsStr) -> isize {
     if right.is_empty() {
         return 0;
     }
-    let mut score = ngram_similarity_weighted(n, left, right);
-    let d = (right.len() as isize - left.len() as isize).abs() - 2;
+    let mut score = ngram_similarity_weighted(n, left, right.as_str());
+    let d = (right.len_chars() as isize - left.len_chars() as isize).abs() - 2;
     if d > 0 {
         score -= d;
     }
     score
 }
 
-fn ngram_similarity_weighted(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
-    let n = n.min(left.len());
+fn ngram_similarity_weighted(n: usize, left: CharsStr, right: &str) -> isize {
+    let n = n.min(left.len_chars());
     let mut score = 0;
     for k in 1..=n {
         let mut k_score = 0;
-        for i in 0..=left.len() - k {
-            let kgram = &left[i..i + k];
-            if contains_subslice(right, kgram) {
+        for i in 0..=left.len_chars() - k {
+            let kgram = left.char_slice(i..i + k);
+            if right.contains(kgram) {
                 k_score += 1;
             } else {
                 k_score -= 1;
-                if i == 0 || i == left.len() - k {
+                if i == 0 || i == left.len_chars() - k {
                     k_score -= 1;
                 }
             }
@@ -600,19 +703,19 @@ fn ngram_similarity_weighted(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
     score
 }
 
-fn count_eq_at_same_pos<T: Eq>(left: &[T], right: &[T]) -> (usize, bool) {
-    let n = left.len().min(right.len());
+fn count_eq_at_same_pos(left: CharsStr, right: CharsStr) -> (usize, bool) {
+    let n = left.len_chars().min(right.len_chars());
     let count = left
-        .iter()
-        .zip(right.iter())
+        .char_iter()
+        .zip(right.char_iter())
         .filter(|(l, r)| l == r)
         .count();
 
     let mut is_swap = false;
     // Only two characters are not equal. Check if they were swapped.
-    if left.len() == right.len() && n - count == 2 {
+    if left.len_chars() == right.len_chars() && n - count == 2 {
         let mut first_mismatch = None;
-        for (l, r) in left.iter().zip(right.iter()) {
+        for (l, r) in left.char_iter().zip(right.char_iter()) {
             if l != r {
                 if let Some((l1, r1)) = first_mismatch {
                     is_swap = l1 == r && r1 == l;
@@ -632,18 +735,10 @@ mod test {
 
     #[test]
     fn index_of_mismatch_test() {
-        assert_eq!(index_of_mismatch(b"abcd", b"abcd"), None);
-        assert_eq!(index_of_mismatch(b"abcd", b"abxy"), Some(2));
-        assert_eq!(index_of_mismatch(b"abcd", b"abc"), Some(3));
-        assert_eq!(index_of_mismatch(b"abc", b"abcd"), None);
-    }
-
-    #[test]
-    fn contains_subslice_test() {
-        assert!(contains_subslice(b"abcd", b"abcd"));
-        assert!(contains_subslice(b"abcd", b"abc"));
-        assert!(contains_subslice(b"abcd", b"bcd"));
-        assert!(contains_subslice(b"abcd", b"b"));
+        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abcd".iter()), None);
+        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abxy".iter()), Some(2));
+        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abc".iter()), Some(3));
+        assert_eq!(index_of_mismatch(b"abc".iter(), b"abcd".iter()), None);
     }
 
     #[test]
@@ -651,27 +746,51 @@ mod test {
         // Rebuilding the Spellchecker:
         // > ngram(3, 'actually', 'akchualy')
         // > 11 = a, c, u, a, l, l, y, ua, al, ly, ual
-        let left: Utf32String = "actually".chars().collect();
-        let right: Utf32String = "akchualy".chars().collect();
-        assert_eq!(ngram_similarity(3, &left, &right), 11);
+        let mut left_buf = Vec::new();
+        let left = CharsStr::new("actually", &mut left_buf);
+        assert_eq!(ngram_similarity(3, left, "akchualy"), 11);
     }
 
     #[test]
     fn longest_common_subsequence_length_test() {
+        let mut left_buffer = Vec::new();
+        let mut right_buffer = Vec::new();
         let mut state_buffer = Vec::new();
         assert_eq!(
-            longest_common_subsequence_length(b"aaa", b"aaa", &mut state_buffer),
+            longest_common_subsequence_length(
+                CharsStr::new("aaa", &mut left_buffer),
+                CharsStr::new("aaa", &mut right_buffer),
+                &mut state_buffer
+            ),
             3
         );
         assert_eq!(
-            longest_common_subsequence_length(b"aaaaa", b"bbbaa", &mut state_buffer),
+            longest_common_subsequence_length(
+                CharsStr::new("aaaaa", &mut left_buffer),
+                CharsStr::new("bbbaa", &mut right_buffer),
+                &mut state_buffer
+            ),
             2
         );
     }
 
     #[test]
     fn count_eq_at_same_pos_test() {
-        assert_eq!(count_eq_at_same_pos(b"abcd", b"abcd"), (4, false));
-        assert_eq!(count_eq_at_same_pos(b"abcd", b"acbd"), (2, true));
+        let mut left_buffer = Vec::new();
+        let mut right_buffer = Vec::new();
+        assert_eq!(
+            count_eq_at_same_pos(
+                CharsStr::new("abcd", &mut left_buffer),
+                CharsStr::new("abcd", &mut right_buffer),
+            ),
+            (4, false)
+        );
+        assert_eq!(
+            count_eq_at_same_pos(
+                CharsStr::new("abcd", &mut left_buffer),
+                CharsStr::new("acbd", &mut right_buffer),
+            ),
            (2, true)
        );
    }
}