From 66749ac51bb3da2e462cf2536d8c799ff98cd74a Mon Sep 17 00:00:00 2001 From: Arun Prasad Date: Sat, 23 Nov 2024 07:49:49 -0800 Subject: [PATCH] [lipi] Increase performance by around 40% Transliteration performance is not a bottleneck for anyone, but this commit applies a few simple optimizations that might be extended to other crates in this repo. Before: - `sample` runs in 13.49s After: - `sample` runs in 7.99s Benchmark changes: - Switch to transliterating millions of small strings as opposed to one enormous string. This seems like a more realistic workload. Transliteration changes: - Avoid looped map lookup and instead look up by the first char. - Avoid an extra hash lookup for a vowel mark. There is still one extra lookup remaining, but further iteration might remove this as well. Reshaping changes: - Avoid `collect` in favor of using a pre-allocated Vec. - If no changes were made, `Matcher` returns the original string without an extra allocation. - Avoid `match_2` and `take_2` logic in tight loops. Instead, fetch each char lazily and at most once. - `push_next` now increments an index so that we can push a whole sequence of chars at once. I also simplified the logic in `transliterate_inner`, but it's unclear how much of an effect that cleanup has. 
--- vidyut-lipi/examples/sample.rs | 14 ++-- vidyut-lipi/src/mapping.rs | 137 +++++++++++++++++++------------ vidyut-lipi/src/numerals.rs | 2 +- vidyut-lipi/src/reshape.rs | 97 +++++++++++++++++----- vidyut-lipi/src/transliterate.rs | 98 +++++++++++----------- vidyut-lipi/src/unicode_norm.rs | 9 +- vidyut-lipi/tests/basic.rs | 9 ++ 7 files changed, 235 insertions(+), 131 deletions(-) diff --git a/vidyut-lipi/examples/sample.rs b/vidyut-lipi/examples/sample.rs index cda1635..4e6d319 100644 --- a/vidyut-lipi/examples/sample.rs +++ b/vidyut-lipi/examples/sample.rs @@ -1,15 +1,13 @@ use vidyut_lipi::{Lipika, Scheme}; fn main() { - let mut input = String::new(); - for _ in 0..1_000_000 { - input.push_str(concat!( + let mut lipika = Lipika::new(); + + for _ in 0..2_000_000 { + let input = concat!( "nArAyaRaM namaskftya naraM cEva narottamam . ", "devIM sarasvatIM cEva tato jayamudIrayet .. 1 .." - )); + ); + lipika.transliterate(input, Scheme::Slp1, Scheme::Devanagari); } - - let mut lipika = Lipika::new(); - let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Devanagari); - _ = lipika.transliterate(output, Scheme::Devanagari, Scheme::Slp1); } diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs index 88e3eb2..4f88314 100644 --- a/vidyut-lipi/src/mapping.rs +++ b/vidyut-lipi/src/mapping.rs @@ -3,35 +3,40 @@ use crate::scheme::Scheme; use rustc_hash::{FxHashMap, FxHashSet}; -/// An output token, which we append to our output string when transliterating. +/// A mapping between a span of input text and a span of output text. #[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub(crate) struct Token { - /// The text of this token. - text: String, +pub(crate) struct Span { + /// The key of this token. + pub key: String, + /// The value of this token. + pub value: String, /// The token type. `kind` controls how this token combines with neighboring tokens. - pub kind: TokenKind, + pub kind: SpanKind, } -impl Token { - /// Creates a new `Token`. 
- pub fn new(text: String, kind: TokenKind) -> Self { - Self { text, kind } +impl Span { + /// Creates a new `Span`. + pub fn new(key: String, text: String, kind: SpanKind) -> Self { + Self { + key, + value: text, + kind, + } } - /// Returns the string value of this token. - pub fn text(&self) -> &str { - &self.text + pub fn is_mark(&self) -> bool { + self.kind == SpanKind::VowelMark } /// Returns whether this token represents a consonant. pub fn is_consonant(&self) -> bool { - self.kind == TokenKind::Consonant + self.kind == SpanKind::Consonant } } /// Models how a token behaves in relation to other tokens. #[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] -pub(crate) enum TokenKind { +pub(crate) enum SpanKind { /// A consonant. A following vowel is generally a vowel mark. Consonant, /// A vowel mark, which generally must follow a consonant. @@ -44,9 +49,9 @@ pub(crate) enum TokenKind { Other, } -impl TokenKind { +impl SpanKind { fn from_devanagari_key(s: &str) -> Self { - use TokenKind::*; + use SpanKind::*; const MARK_AA: char = '\u{093e}'; const MARK_AU: char = '\u{094c}'; @@ -213,8 +218,8 @@ impl OneWayMapping { let v = vals.first()?; out.push_str(v); - let token_kind = TokenKind::from_devanagari_key(&deva_char); - if self.to_scheme.is_alphabet() && token_kind == TokenKind::Consonant { + let token_kind = SpanKind::from_devanagari_key(&deva_char); + if self.to_scheme.is_alphabet() && token_kind == SpanKind::Consonant { out.push('a'); } } @@ -254,15 +259,16 @@ impl OneWayMapping { pub struct Mapping { pub(crate) from: Scheme, pub(crate) to: Scheme, - pub(crate) all: FxHashMap, + pub(crate) all: FxHashMap, pub(crate) marks: FxHashMap, pub(crate) from_map: OneWayMapping, pub(crate) to_map: OneWayMapping, - pub(crate) len_longest_key: usize, pub(crate) numeral_to_int: FxHashMap, pub(crate) int_to_numeral: FxHashMap, + + tokens_by_first_char: FxHashMap>, } impl Mapping { @@ -290,7 +296,6 @@ impl Mapping { /// /// 1. 
A mapping `a --> x` without a corresponding `x --> b`. For example, consider `| --> ळ`, /// where `|` is an SLP1 character and `ळ` is not defined in B. In this case, we - /// transliterate `x` to scheme `B` then programmatically create a new `a --> b` mapping. /// /// 2. A mapping `x --> b` without a corresponding `a --> x`. For example, consider `ळ --> |`, /// where `|` is again an SLP1 character and `ळ` is not defined in A. In this case, we @@ -320,8 +325,8 @@ impl Mapping { None => continue, }; - let token_kind = TokenKind::from_devanagari_key(deva_key); - if token_kind == TokenKind::VowelMark { + let token_kind = SpanKind::from_devanagari_key(deva_key); + if token_kind == SpanKind::VowelMark { marks.insert(a.to_string(), b.to_string()); } @@ -332,14 +337,17 @@ impl Mapping { // // - If a sound has alternates, we store only the first. if !all.contains_key(a) { - all.insert(a.to_string(), Token::new(b.to_string(), token_kind)); + all.insert( + a.to_string(), + Span::new(a.to_string(), b.to_string(), token_kind), + ); seen_b.insert(b); } } } for (deva_key, a) in from.token_pairs() { - let token_kind = TokenKind::from_devanagari_key(deva_key); + let token_kind = SpanKind::from_devanagari_key(deva_key); if !all.contains_key(*a) && b_map.get(deva_key).is_none() { // Mapping `a --> x` doesn't have a corresponding `x --> b`. // So, create one. 
@@ -348,10 +356,10 @@ impl Mapping { None => continue, }; - if token_kind == TokenKind::VowelMark { + if token_kind == SpanKind::VowelMark { marks.insert(a.to_string(), new_b.clone()); } - all.insert(a.to_string(), Token::new(new_b, token_kind)); + all.insert(a.to_string(), Span::new(a.to_string(), new_b, token_kind)); } } @@ -365,13 +373,16 @@ impl Mapping { None => continue, }; - let token_kind = TokenKind::from_devanagari_key(deva_key); + let token_kind = SpanKind::from_devanagari_key(deva_key); if !new_a.is_empty() && !all.contains_key(&new_a) { - if token_kind == TokenKind::VowelMark { + if token_kind == SpanKind::VowelMark { marks.insert(new_a.clone(), b.to_string()); } - all.insert(new_a, Token::new(b.to_string(), token_kind)); + all.insert( + new_a.to_string(), + Span::new(new_a.to_string(), b.to_string(), token_kind), + ); } } @@ -381,9 +392,19 @@ impl Mapping { } // Take length in *chars*, not in *bytes*. // (Using chars over bytes offers a ~3x speedup in the core transliterate loop.) - let len_longest_key = all.keys().map(|a| a.chars().count()).max().unwrap_or(0); let numeral_to_int = a_map.numeral_to_int.clone(); + let mut tokens_by_first_char = FxHashMap::default(); + for t in all.values() { + if let Some(first_char) = t.key.chars().next() { + debug_assert!(!t.key.is_empty()); + tokens_by_first_char + .entry(first_char) + .or_insert(Vec::new()) + .push(t.clone()); + } + } + Self { from, to, @@ -391,9 +412,9 @@ impl Mapping { marks, from_map: a_map, to_map: b_map, - len_longest_key, numeral_to_int, int_to_numeral, + tokens_by_first_char, } } @@ -407,10 +428,17 @@ impl Mapping { self.to } - pub(crate) fn get(&self, key: &str) -> Option<&Token> { + pub(crate) fn get(&self, key: &str) -> Option<&Span> { self.all.get(key) } + pub(crate) fn spans_starting_with(&self, c: char) -> &[Span] { + match self.tokens_by_first_char.get(&c) { + Some(v) => v, + None => &[], + } + } + /// Dumps this mapping's data to stdout. 
#[allow(unused)] pub(crate) fn dump(&self) { @@ -418,8 +446,8 @@ impl Mapping { items.sort_by(|x, y| x.0.cmp(y.0)); for (k, v) in items { let k_codes: Vec<_> = k.chars().map(|c| c as u32).collect(); - let v_codes: Vec<_> = v.text().chars().map(|c| c as u32).collect(); - println!("{k} ({k_codes:x?}) --> {} ({v_codes:x?})", v.text()); + let v_codes: Vec<_> = v.value.chars().map(|c| c as u32).collect(); + println!("{k} ({k_codes:x?}) --> {} ({v_codes:x?})", v.value); } } } @@ -431,9 +459,9 @@ mod tests { #[test] fn test_decide_token_type() { - let is_mark = |c| TokenKind::from_devanagari_key(c) == TokenKind::VowelMark; - let is_consonant = |c| TokenKind::from_devanagari_key(c) == TokenKind::Consonant; - let is_other = |c| TokenKind::from_devanagari_key(c) == TokenKind::Other; + let is_mark = |c| SpanKind::from_devanagari_key(c) == SpanKind::VowelMark; + let is_consonant = |c| SpanKind::from_devanagari_key(c) == SpanKind::Consonant; + let is_other = |c| SpanKind::from_devanagari_key(c) == SpanKind::Other; assert!(is_mark("\u{093e}")); assert!(is_mark("\u{093f}")); @@ -483,36 +511,36 @@ mod tests { #[test] fn test_mapping() { - let other = |x: &str| Token::new(x.to_string(), TokenKind::Other); - let mark = |x: &str| Token::new(x.to_string(), TokenKind::VowelMark); + let other = |x: &str, y: &str| Span::new(x.to_string(), y.to_string(), SpanKind::Other); + let mark = |x: &str, y: &str| Span::new(x.to_string(), y.to_string(), SpanKind::VowelMark); let m = Mapping::new(Devanagari, Itrans); assert_eq!(m.from(), Devanagari); assert_eq!(m.to(), Itrans); - let assert_has = |m: &Mapping, x: &str, y: &Token| { + let assert_has = |m: &Mapping, x: &str, y: &Span| { assert_eq!(m.get(x).unwrap(), y); }; let m = Mapping::new(Devanagari, Itrans); - assert_has(&m, "आ", &other("A")); - assert_has(&m, "\u{093e}", &mark("A")); - assert_has(&m, "ए", &other("e")); - assert_has(&m, "\u{0947}", &mark("e")); + assert_has(&m, "आ", &other("आ", "A")); + assert_has(&m, "\u{093e}", 
&mark("\u{093e}", "A")); + assert_has(&m, "ए", &other("ए", "e")); + assert_has(&m, "\u{0947}", &mark("\u{0947}", "e")); let m = Mapping::new(Bengali, Itrans); - assert_has(&m, "\u{09be}", &mark("A")); - assert_has(&m, "\u{09c7}", &mark("e")); + assert_has(&m, "\u{09be}", &mark("\u{09be}", "A")); + assert_has(&m, "\u{09c7}", &mark("\u{09c7}", "e")); } #[test] fn test_mapping_with_unicode_decompositions() { // Maps to NFD let m = Mapping::new(Velthuis, Devanagari); - let cons = |x: &str| Token::new(x.to_string(), TokenKind::Consonant); - assert_eq!(m.get("R").unwrap(), &cons("\u{0921}\u{093c}")); - assert_eq!(m.get("Rh").unwrap(), &cons("\u{0922}\u{093c}")); + let cons = |x: &str, y: &str| Span::new(x.to_string(), y.to_string(), SpanKind::Consonant); + assert_eq!(m.get("R").unwrap(), &cons("R", "\u{0921}\u{093c}")); + assert_eq!(m.get("Rh").unwrap(), &cons("Rh", "\u{0922}\u{093c}")); // Maps from NFD and composed let m = Mapping::new(Devanagari, Velthuis); @@ -527,8 +555,11 @@ mod tests { assert_eq!(velthuis.data.get("\u{0921}\u{093c}").unwrap(), &vec!["R"]); assert_eq!(velthuis.data.get("\u{095c}"), None); - assert_eq!(m.get("\u{0921}\u{093c}").unwrap(), &cons("R")); - assert_eq!(m.get("\u{095c}").unwrap(), &cons("R")); - assert_eq!(m.get("\u{095d}").unwrap(), &cons("Rh")); + assert_eq!( + m.get("\u{0921}\u{093c}").unwrap(), + &cons("\u{0921}\u{093c}", "R") + ); + assert_eq!(m.get("\u{095c}").unwrap(), &cons("\u{095c}", "R")); + assert_eq!(m.get("\u{095d}").unwrap(), &cons("\u{095d}", "Rh")); } } diff --git a/vidyut-lipi/src/numerals.rs b/vidyut-lipi/src/numerals.rs index e4e883b..17098c3 100644 --- a/vidyut-lipi/src/numerals.rs +++ b/vidyut-lipi/src/numerals.rs @@ -224,7 +224,7 @@ pub fn transliterate_numeral(buffer: &mut String, numeral: &str, mapping: &Mappi let glyph_str = c.encode_utf8(&mut temp); mapping.all.get(glyph_str) }) { - buffer.push_str(glyph.text()); + buffer.push_str(&glyph.value); } } } diff --git a/vidyut-lipi/src/reshape.rs 
b/vidyut-lipi/src/reshape.rs index e5d8905..7686094 100644 --- a/vidyut-lipi/src/reshape.rs +++ b/vidyut-lipi/src/reshape.rs @@ -24,6 +24,7 @@ use crate::scheme::Scheme; use crate::unicode_norm; +use core::str::Chars; const BENGALI_LETTER_YA: char = '\u{09af}'; @@ -407,28 +408,37 @@ fn is_zanabazar_square_consonant(c: char) -> bool { /// /// - Most of our changes are simple enough that we don't need the full power of a standard regex /// engine. A simpler solution works just as well while providing better readability. -struct Matcher<'a> { +struct Matcher { /// The input text to reshap. - text: &'a str, + text: String, /// Byte offset within `text`. This points to the next character to consider. i: usize, + /// Byte offset when using `push_next`. We use this so that we can push spans. + prev: usize, /// The output buffer. buf: String, } -impl<'a> Matcher<'a> { +impl Matcher { /// Creates a new `Matcher`. - fn new(text: &'a str) -> Self { + fn new(text: String) -> Self { Self { text, i: 0, + prev: 0, buf: String::new(), } } /// Returns the final output buffer. - fn finish(self) -> String { - self.buf + fn finish(mut self) -> String { + if !self.buf.is_empty() { + self.flush(); + self.buf + } else { + // Nothing matched, so return the original text. + self.text + } } /// Returns whether there is still content remaining in the input string. @@ -441,6 +451,11 @@ impl<'a> Matcher<'a> { &self.text[self.i..] } + /// Returns the active slice. + fn chars(&self) -> Chars<'_> { + self.text[self.i..].chars() + } + /// Returns whether the next character matches the given `filter`. fn match_1(&mut self, filter: impl Fn(char) -> bool) -> bool { let mut chars = self.slice().chars(); @@ -469,34 +484,60 @@ impl<'a> Matcher<'a> { /// Consumes the next character then uses `func` to append to the buffer. 
fn take_1(&mut self, func: impl Fn(&mut String, char)) { + self.flush(); let mut chars = self.slice().chars(); if let Some(x) = chars.next() { func(&mut self.buf, x); self.i += x.len_utf8(); + self.prev = self.i; } } /// Consumes the next 2 characters then uses `func` to append to the buffer. fn take_2(&mut self, func: impl Fn(&mut String, char, char)) { + self.flush(); let mut chars = self.slice().chars(); if let (Some(x), Some(y)) = (chars.next(), chars.next()) { func(&mut self.buf, x, y); self.i += x.len_utf8() + y.len_utf8(); + self.prev = self.i; } } /// Consumes the next 3 characters then uses `func` to append to the buffer. fn take_3(&mut self, func: impl Fn(&mut String, char, char, char)) { + self.flush(); let mut chars = self.slice().chars(); if let (Some(x), Some(y), Some(z)) = (chars.next(), chars.next(), chars.next()) { func(&mut self.buf, x, y, z); self.i += x.len_utf8() + y.len_utf8() + z.len_utf8(); + self.prev = self.i; + } + } + + fn flush(&mut self) { + if self.prev < self.i { + self.buf += &self.text[self.prev..self.i]; + self.prev = self.i; } } /// Consumes the next char and adds it immediately to the buffer. fn push_next(&mut self) { - self.take_1(|buf, x| buf.push(x)); + if let Some(c) = self.slice().chars().next() { + self.i += c.len_utf8(); + } + } + + fn swap(&mut self, old: &[char], new: &[char]) { + self.flush(); + self.buf.extend(new); + self.i += old.iter().map(|c| c.len_utf8()).sum::(); + self.prev = self.i; + } + + fn add(&mut self, c: char) { + self.i += c.len_utf8(); } } @@ -507,8 +548,8 @@ pub fn reshape_before(input: &str, from: Scheme) -> String { // Convert to NFC first to avoid certain transliteration errors. // (See `iso_15919_bug_no_greedy_match_on_nfd` for an example of what we want to prevent.) 
let input = unicode_norm::to_nfc(input); + let mut m = Matcher::new(input); - let mut m = Matcher::new(&input); match from { Scheme::Assamese | Scheme::Bengali => { while m.not_empty() { @@ -550,11 +591,18 @@ pub fn reshape_before(input: &str, from: Scheme) -> String { m.finish() } Scheme::Devanagari | Scheme::Gujarati | Scheme::Kannada | Scheme::Odia | Scheme::Telugu => { + // TODO: efficient but not as readable as match/take. Is there a way to make this more + // concise? while m.not_empty() { - if m.match_2(|x, y| is_ayogavaha(x) && is_svara(y)) { - m.take_2(|buf, x, y| buf.extend(&[y, x])); - } else { - m.push_next(); + let mut it = m.chars(); + if let Some(x) = it.next() { + if is_ayogavaha(x) { + if let Some(y) = it.next().filter(|c| is_svara(*c)) { + m.swap(&[x, y], &[y, x]); + continue; + } + } + m.add(x); } } m.finish() @@ -709,6 +757,7 @@ pub fn reshape_before(input: &str, from: Scheme) -> String { // tsheg --> space m.take_1(|buf, _| buf.push(' ')) } else if m.match_1(is_tibetan_subjoined_consonant) { + m.flush(); // Unwrap to dummy characters for simpler logic below. let w = m.buf.chars().last().unwrap_or('_'); let y = m.slice().chars().nth(1).unwrap_or('_'); @@ -740,6 +789,7 @@ pub fn reshape_before(input: &str, from: Scheme) -> String { } Scheme::ZanabazarSquare => { const ZANABAZAR_SQUARE_MARK_TSHEG: char = '\u{11a41}'; + while m.not_empty() { if m.match_1(|x| x == ZANABAZAR_SQUARE_MARK_TSHEG) { // tsheg --> space @@ -753,7 +803,7 @@ pub fn reshape_before(input: &str, from: Scheme) -> String { } m.finish() } - _ => input, + _ => m.finish(), } } @@ -773,7 +823,7 @@ fn is_bengali_sound(c: char) -> bool { /// Reshapes `output` after we run the main transliteration function. 
pub fn reshape_after(output: String, to: Scheme) -> String { - let mut m = Matcher::new(&output); + let mut m = Matcher::new(output); match to { Scheme::Assamese | Scheme::Bengali => { @@ -844,7 +894,7 @@ pub fn reshape_after(output: String, to: Scheme) -> String { // Substitution above blocks substitution here, so split into two Matchers. let first = m.finish(); - let mut m = Matcher::new(&first); + let mut m = Matcher::new(first); while m.not_empty() { if m.match_2(|x, y| has_cham_final_consonant(x) && y == FAKE_VIRAMA) { m.take_2(|buf, x, _| { @@ -863,11 +913,18 @@ pub fn reshape_after(output: String, to: Scheme) -> String { | Scheme::Malayalam | Scheme::Odia | Scheme::Telugu => { + // TODO: efficient but not as readable as match/take. Is there a way to make this more + // concise? while m.not_empty() { - if m.match_2(|x, y| is_svara(x) && is_ayogavaha(y)) { - m.take_2(|buf, x, y| buf.extend(&[y, x])); - } else { - m.push_next(); + let mut it = m.chars(); + if let Some(x) = it.next() { + if is_svara(x) { + if let Some(y) = it.next().filter(|c| is_ayogavaha(*c)) { + m.swap(&[x, y], &[y, x]); + continue; + } + } + m.add(x); } } m.finish() @@ -1071,6 +1128,6 @@ pub fn reshape_after(output: String, to: Scheme) -> String { } m.finish() } - _ => output, + _ => m.finish(), } } diff --git a/vidyut-lipi/src/transliterate.rs b/vidyut-lipi/src/transliterate.rs index 8c97a31..72f3318 100644 --- a/vidyut-lipi/src/transliterate.rs +++ b/vidyut-lipi/src/transliterate.rs @@ -47,8 +47,9 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { let input = reshape_before(input, mapping.from()); let is_to_alphabet = mapping.to.is_alphabet(); - let is_from_abugida = mapping.from.is_abugida(); let is_to_abugida = mapping.to.is_abugida(); + let is_from_abugida = mapping.from.is_abugida(); + let is_from_alphabet = mapping.from.is_alphabet(); let is_from_itrans = mapping.from == Scheme::Itrans; let uses_non_decimal = @@ -85,7 +86,7 @@ fn transliterate_inner(input: &str, 
mapping: &Mapping) -> String { } else { // Otherwise, add independent 'a' vowel. if let Some(x) = mapping.get("a") { - output.push_str(x.text()); + output.push_str(&x.value); } } // Increment past "a:" @@ -97,55 +98,55 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { // // We must check for the *largest* match to distinguish between `b` and `bh`, `R` and `RR`, // etc. - let mut token = None; - let mut key: &str = ""; - let mut next_i = i; - for len_key in (1..=mapping.len_longest_key).rev() { - // `nth` is 0-indexed. - let j = input[i..].char_indices().nth(len_key).map(|(i, _)| i); - next_i = if let Some(j) = j { i + j } else { input.len() }; - key = &input[i..next_i]; - token = mapping.get(key); - if token.is_some() { - break; - } - } - debug_assert!(next_i > i, "next_i = {next_i}, i = {i}"); + // New approach. + let slice = &input[i..]; + let c = match slice.chars().next() { + Some(c) => c, + None => break, + }; + let span = mapping + .spans_starting_with(c) + .iter() + .filter(|t| slice.starts_with(&t.key)) + .max_by_key(|t| t.key.len()); // 2. Append the mapped result, if it exists. - if let Some(token) = token { + if let Some(span) = span { + let key = &span.key; // `letter_a` is "a" for Latin scripts but takes other values for non-Latin scripts // like Ol Chiki. - // Abugidas and alphabets have distinct logic here, so keep their code neatly separate. - if is_from_abugida { + // Abugidas and alphabets have distinct logic, so keep their code neatly separate. + if is_from_abugida && is_to_alphabet { let a = &mapping.to_map.letter_a; - if output.ends_with(a) - && (mapping.marks.contains_key(key) || key == mapping.from_map.virama) - { - // `key` maps to a token that blocks the default "a" vowel, so pop the "a" that - // we added in the previous iteration. 
+ if output.ends_with(a) && (span.is_mark() || key == &mapping.from_map.virama) { + // `key` maps to a token that blocks the default "a" vowel: + // + // - a vowel mark inserts a different vowel sound instead. + // - the virama prevents "a" from being added. + // + // So, pop the "a" that we added in the previous iteration. output.pop(); } - output += &token.text(); + output += &span.value; - if is_to_alphabet && token.is_consonant() { + if span.is_consonant() { // Add an implicit "a" vowel. // // (The next loop iteration might pop this "a" off of `output`.) output += a; } - } else { + } else if is_from_alphabet && is_to_abugida { // Transliterate from alphabet - if had_virama && key == mapping.from_map.letter_a { + if had_virama && key == &mapping.from_map.letter_a { // `key` is the default "a" vowel, so pop the virama that we added in the // previous iteration. output.pop(); had_virama = false; } else { - let mut text = token.text(); + let mut text = &span.value; if had_virama { if let Some(mark) = mapping.marks.get(key) { output.pop(); @@ -155,7 +156,7 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { output += text; - if token.is_consonant() && is_to_abugida { + if span.is_consonant() { // We have not seen a vowel mark yet, so push a virama for now. // // (The next loop iteration might pop this virama off of `output`.) @@ -163,6 +164,8 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { had_virama = true; } } + } else { + output += &span.value; } // Special case: to ISO-15959 separator logic for a:i, a:u @@ -172,33 +175,32 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { // TODO: is there a better place to put this? 
if mapping.to == Scheme::Iso15919 && (output.ends_with("ai") || output.ends_with("au")) - && matches!(token.text(), "i" | "u") + && matches!(span.value.as_str(), "i" | "u") { output.pop(); output.push(':'); - output.push_str(token.text()); + output.push_str(&span.value); } - } else { + + // Prepare for next loop. + debug_assert!(!span.key.is_empty()); + i += span.key.len(); + } else if is_from_itrans && c == '\\' { // ITRANS: `\` skips the next character. - if is_from_itrans { - let mut chars = input[i..].chars(); - if chars.next() == Some('\\') { - i += 1; - if let Some(c) = chars.next() { - output.push(c); - i += c.len_utf8(); - } - continue; - } + i += c.len_utf8(); + if let Some(c) = slice.chars().nth(1) { + output.push(c); + i += c.len_utf8(); } - + continue; + } else { // Use the original character as-is. - output.push_str(key); + output.push(c); had_virama = false; - } - // Prepare for next loop. - i = next_i; + // Prepare for next loop. + i += c.len_utf8(); + } } reshape_after(output, mapping.to()) diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs index 4363354..b480da8 100644 --- a/vidyut-lipi/src/unicode_norm.rs +++ b/vidyut-lipi/src/unicode_norm.rs @@ -260,7 +260,14 @@ pub const TIRHUTA_NFD: Table = &[ /// will be left as-is. #[cfg(not(target_arch = "wasm32"))] pub(crate) fn to_nfc(s: &str) -> String { - s.nfc().collect() + // `.collect()` seems not to pre-allocate. + // + // Result: 8.27s --> 7.89s (-5%) + let mut ret = String::with_capacity(s.len()); + for c in s.nfc() { + ret.push(c); + } + ret } /// WASM-only version of `to_nfc`. 
diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs index ee1aef2..ec4d01e 100644 --- a/vidyut-lipi/tests/basic.rs +++ b/vidyut-lipi/tests/basic.rs @@ -1242,3 +1242,12 @@ fn velthuis_basic() { // Extended consonants assert_has("qa .kha .ga za Ra Rha fa", "क़ ख़ ग़ ज़ ड़ ढ़ फ़"); } + +// Other bugs +// ---------- + +/// Tests that skipping unmappable content doesn't land within a char boundary. +#[test] +fn test_mixed_content() { + assert_transliterate("saMskftam 漢語", HarvardKyoto, Devanagari, "संस्क्फ़्तम् 漢語"); +}