From aa32d228e43ca38c3e00114374ea69fb81a15726 Mon Sep 17 00:00:00 2001 From: Arun Prasad Date: Sat, 10 Feb 2024 14:58:10 -0800 Subject: [PATCH] [lipi] Add 4 schemes and better Vedic support Schemes: - Add basic support for Assamese, Kharoshthi, Mon, and Soyombo. Features: - Add stronger support for upadhmaniya and jihvamuliya. - Add limited support for Meetei Mayek. - Add limited support for Samaveda svaras. Bug fixes: - Fix treatment of Malayalam au. - Add experimental support for Malayalam chillus. - Use NFC internally to avoid a bug with greedy ISO matching. - Support ayogavahas used with svaras in Devanagari. - Fix display for several schemes in the UI. Code: - Simplify and clean up `create_schemes.py`. - Add benchmark test. --- README.md | 7 +- vidyut-lipi/Makefile | 5 + vidyut-lipi/README.md | 7 +- vidyut-lipi/examples/sample.rs | 15 + vidyut-lipi/scripts/create_schemes.py | 712 +++++++++++++--------- vidyut-lipi/src/autogen_schemes.rs | 618 ++++++++++++++++--- vidyut-lipi/src/detect.rs | 23 +- vidyut-lipi/src/errors.rs | 23 + vidyut-lipi/src/lib.rs | 1 + vidyut-lipi/src/mapping.rs | 86 +-- vidyut-lipi/src/reshape.rs | 476 ++++++++++++++- vidyut-lipi/src/scheme.rs | 76 ++- vidyut-lipi/src/transliterate.rs | 10 +- vidyut-lipi/src/unicode_norm.rs | 15 +- vidyut-lipi/tests/basic.rs | 250 +++++++- vidyut-lipi/www/static/vidyut-lipi-app.js | 36 +- 16 files changed, 1820 insertions(+), 540 deletions(-) create mode 100644 vidyut-lipi/examples/sample.rs create mode 100644 vidyut-lipi/src/errors.rs diff --git a/README.md b/README.md index bf60161..631f6a3 100644 --- a/README.md +++ b/README.md @@ -139,10 +139,9 @@ For details, see the [vidyut-kosha README][vidyut-kosha]. ### [`vidyut-lipi`][vidyut-lipi] -`vidyut-lipi` is an experimental Sanskrit transliteration library that also +`vidyut-lipi` is a transliteration library for Sanskrit and Pali that also supports many of the scripts used within the Indosphere. 
Our goal is to provide -a standard transliterator for the Sanskrit ecosystem that is easy to bind to -other programming languages. +a standard transliterator that is easy to bind to other programming languages. For details, see the [vidyut-lipi README][vidyut-lipi]. @@ -217,8 +216,6 @@ we are most excited about: - dependency parsing and *anvaya* generation - search indexing that accounts for sandhi and Sanskrit's complex morphology. -- transliteration, perhaps through a port of [Aksharamukha][aksharamukha] -- meter recognition - support for Vedic Sanskrit - implementations of non-Paninian grammars diff --git a/vidyut-lipi/Makefile b/vidyut-lipi/Makefile index d9d6892..50246db 100644 --- a/vidyut-lipi/Makefile +++ b/vidyut-lipi/Makefile @@ -3,3 +3,8 @@ debugger: test: cargo nextest run --no-fail-fast --status-level=fail && cargo test --doc + +# Profiles the program's execution time on OSX. This command will probably not +# work on other operating systems. +profile-time-osx: + cargo instruments -t time --release --example sample > /dev/null diff --git a/vidyut-lipi/README.md b/vidyut-lipi/README.md index 0a95370..d79b008 100644 --- a/vidyut-lipi/README.md +++ b/vidyut-lipi/README.md @@ -1,12 +1,11 @@

vidyut-lipi

-

A fast Sanskrit transliterator

+

An Indic transliterator

-`vidyut-lipi` is an experimental Sanskrit transliteration library that also +`vidyut-lipi` is a transliteration library for Sanskrit and Pali that also supports many of the scripts used within the Indosphere. Our goal is to provide -a standard transliterator for the Sanskrit ecosystem that is easy to bind to -other programming languages. +a standard transliterator that is easy to bind to other programming languages. This [crate][crate] is under active development as part of the [Ambuda][ambuda] project. If you enjoy our work and wish to contribute to it, we encourage you diff --git a/vidyut-lipi/examples/sample.rs b/vidyut-lipi/examples/sample.rs new file mode 100644 index 0000000..f1d79ed --- /dev/null +++ b/vidyut-lipi/examples/sample.rs @@ -0,0 +1,15 @@ +use vidyut_lipi::{Lipika, Scheme}; + +fn main() { + let mut input = String::new(); + for _ in 0..1_000_000 { + input.push_str(concat!( + "nArAyaRaM namaskftya naraM cEva narottamam . ", + "devIM sarasvatIM cEva tato jayamudIrayet .. 1 .." 
+ )); + } + + let mut lipika = Lipika::new(); + let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Tibetan); + println!("{output}"); +} diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py index a8d54e5..3bbe0d2 100755 --- a/vidyut-lipi/scripts/create_schemes.py +++ b/vidyut-lipi/scripts/create_schemes.py @@ -16,6 +16,7 @@ ALLOWED = { "AHOM", + "ASSAMESE", "BALINESE", "BENGALI", "BHAIKSUKI", @@ -32,6 +33,7 @@ "JAVANESE", "KAITHI", "KANNADA", + "KHAROSHTHI", "KHMER", "KHUDAWADI", "LAO", @@ -40,6 +42,7 @@ "MALAYALAM", "MANIPURI", "MODI", + "MON", "NANDINAGARI", "NEWA", "OL_CHIKI", @@ -47,8 +50,8 @@ "SHARADA", "SIDDHAM", "SINHALA", - # Not yet on indic-transliteration/master "SAURASHTRA", + "SOYOMBO", "TAKRI", "TAI_THAM", "TAMIL_SUPERSCRIPTED", @@ -57,7 +60,6 @@ "TIBETAN", "TIRHUTA_MAITHILI", "ZANBAZAR_SQUARE", - "BARAHA", "HK", "IAST", @@ -88,7 +90,6 @@ "\u0912": "O", "\u0913": "OO", "\u0914": "AU", - "\u093e": "SIGN_AA", "\u093f": "SIGN_I", "\u0940": "SIGN_II", @@ -106,12 +107,10 @@ "\u094a": "SIGN_O", "\u094b": "SIGN_OO", "\u094c": "SIGN_AU", - "\u0902": "ANUSVARA", "\u0903": "VISARGA", "\u0901": "CANDRABINDU", "\u094d": "VIRAMA", - "\u0915": "KA", "\u0916": "KHA", "\u0917": "GA", @@ -129,7 +128,6 @@ "\u0923": "NNA", "\u0924": "TA", "\u0925": "THA", - "\u0926": "DA", "\u0927": "DHA", "\u0928": "NA", @@ -141,22 +139,18 @@ "\u092f": "YA", "\u0930": "RA", "\u0932": "LA", - "\u0935": "VA", "\u0936": "SHA", "\u0937": "SSA", "\u0938": "SA", "\u0939": "HA", - "\u0933": "LLA", "\u0931": "RRA", "\u0929": "NNNA", "\u0934": "LLLA", - "क्ष": "KSSA", "ज्ञ": "JNYA", "त्र": "TRA", - "\u0915\u093c": "QA", "\u0916\u093c": "KHHA", "\u0917\u093c": "GHHA", @@ -165,11 +159,9 @@ "\u0922\u093c": "RHA", "\u092b\u093c": "FA", "\u092f\u093c": "YYA", - "\u0951": "SVARITA", "\u1cda": "DOUBLE_SVARITA", "\u0952": "ANUDATTA", - "\u0966": "DIGIT_0", "\u0967": "DIGIT_1", "\u0968": "DIGIT_2", @@ -180,7 +172,6 @@ "\u096d": "DIGIT_7", "\u096e": 
"DIGIT_8", "\u096f": "DIGIT_9", - "\u0950": "OM", "\u093c": "NUKTA", "\u093d": "AVAGRAHA", @@ -188,7 +179,32 @@ "\u0965": "DOUBLE_DANDA", "\u0970": "ABBREVIATION_SIGN", "\u0971": "HIGH_SPACING_DOT", - + "\u1cd2": "PRENKHA", + "\u1cdd": "VEDIC_DOT_BELOW", + "\u1ce1": "ATHARVAVEDA_INDEPENDENT_SVARITA", + "\u1cf5": "JIHVAMULIYA", + "\u1cf6": "UPADHMANIYA", + "\u1cf8": "VEDIC_RING_ABOVE", + "\ua8e0": "COMBINING_DIGIT_0", + "\ua8e1": "COMBINING_DIGIT_1", + "\ua8e2": "COMBINING_DIGIT_2", + "\ua8e3": "COMBINING_DIGIT_3", + "\ua8e4": "COMBINING_DIGIT_4", + "\ua8e5": "COMBINING_DIGIT_5", + "\ua8e6": "COMBINING_DIGIT_6", + "\ua8e7": "COMBINING_DIGIT_7", + "\ua8e8": "COMBINING_DIGIT_8", + "\ua8e9": "COMBINING_DIGIT_9", + "\ua8ea": "COMBINING_A", + "\ua8eb": "COMBINING_U", + "\ua8ec": "COMBINING_KA", + "\ua8ed": "COMBINING_NA", + "\ua8ee": "COMBINING_PA", + "\ua8ef": "COMBINING_RA", + "\ua8f0": "COMBINING_VI", + "\ua8f1": "COMBINING_AVAGRAHA", + "\ua8f3": "CANDRABINDU_VIRAMA", + "\u0b83": "TAMIL_AYTHAM", "\u200c": "ZERO_WIDTH_NON_JOINER", "\u200d": "ZERO_WIDTH_JOINER", } @@ -211,6 +227,333 @@ "औ": "\u094c", } + +JIHVAMULIYA = "\u1cf5" +UPADHMANIYA = "\u1cf6" +DANDA = "\u0964" +DOUBLE_DANDA = "\u0965" + + +OVERRIDES = { + "BARAHA": + # Existing accent marks seem to be mostly wrong -- delete so that we + # can redefine them elsewhere. + { + "\u1ce1": None, + "\ua8e1": None, + "\ua8e2": None, + "\ua8e3": None, + }, + "GRANTHA": { + # vowel sign AU + "\u094c": "\U0001134c", + }, + "GONDI_GUNJALA": { + # No avagraha defined -- for now, use Devanagari avagraha as placeholder. + "\u093d": "\u093d", + }, + "GONDI_MASARAM": { + # Virama + "\u094d": "\U00011d45", + # No avagraha defined -- for now, use Devanagari avagraha as placeholder. 
+ "\u093d": "\u093d", + # Conjuncts + "क्ष": "𑴮", + "ज्ञ": "𑴯", + }, + "GURMUKHI": { + "\u090b": "ਰੁ", # letter vocalic r + "\u0960": "ਰੂ", # letter vocalic rr + "\u090c": "ਲੁ", # letter vocalic l + "\u0961": "ਲੂ", # letter vocalic ll + "\u0943": "\u0a4dਰੁ", # sign vocalic r + "\u0944": "\u0a4dਰੂ", # sign vocalic rr + "\u0962": "\u0a4dਲੁ", # sign vocalic l + "\u0963": "\u0a4dਲੂ", # sign vocalic ll + "\u090e": "ਏ", # letter short e + "\u0912": "ਓ", # letter short o + "\u0946": "\u0a47", # sign short e + "\u094a": "\u0a4b", # sign short o + }, + "HK": { + DANDA: ".", + DOUBLE_DANDA: "..", + }, + "ISO": { + "।": ".", + "॥": "..", + "ख़": "k͟h", + # Delete -- common_maps maps this to "ḳ", which we need for aytam. + # We'll add a valid mapping for क़: further below. + "क़": None, + # Delete -- mistake, corrected below. + "ḫ": None, + }, + "IAST": { + "ळ": "ḻ", + "ऴ": None, + "।": ".", + "॥": "..", + # candrabindu + "\u0901": "m̐", + }, + "JAVANESE": { + DANDA: "\ua9c8", + DOUBLE_DANDA: "\ua9c9", + }, + "KHAROSHTHI": { + "।": "\U00010a56", + "॥": "\U00010a57", + }, + "KHMER": { + "।": "។", + "॥": "៕", + }, + "KHUDAWADI": { + "\u090e": "\U000112b6", # letter short e + "\u0912": "\U000112b8", # letter short o + "\u0946": "\U000112e5", # sign short e + "\u094a": "\U000112e7", # sign short o + }, + "LIMBU": { + "\u090a": "\u1900\u1922\u193a", # letter uu + "\u0960": "\u1916\u1922\u193a", # letter vocalic rr + "\u0961": "\u1917\u1922\u193a", # letter vocalic ll + "\u0942": "\u1922\u193a", # sign uu + "\u0943": "\u193b\u1916\u1922", # sign vocalic r + "\u0944": "\u193b\u1916\u1922\u193a", # sign vocalic rr + "\u0962": "\u193b\u1917\u1922", # sign vocalic l + "\u0963": "\u193b\u1917\u1922\u193a", # sign vocalic ll + "\u090e": "\u1900\u1927", # letter short e + "\u0912": "\u1900\u1928", # letter short o + "\u0946": "\u1927", # sign short e + "\u094a": "\u1928", # sign short o + }, + "MANIPURI": { + DANDA: "꯫", + DOUBLE_DANDA: "꯫꯫", + }, + "MALAYALAM": { + # sign short o + 
"\u094a": "\u0d4a", + # sign au + "\u094c": "\u0d57", + }, + "MODI": { + "\u0907": "\U00011602", # letter i + "\u0908": "\U00011603", # letter ii + "\u0909": "\U00011604", # letter u + "\u090a": "\U00011605", # letter uu + "\u090b": "\U00011606", # letter vocalic r + "\u090c": "\U00011608", # letter vocalic l + "\u093f": "\U00011631", # sign i + "\u0940": "\U00011632", # sign ii + "\u0941": "\U00011633", # sign u + "\u0942": "\U00011634", # sign uu + "\u0943": "\U00011635", # sign vocalic r + "\u0944": "\U00011636", # sign vocalic rr + "\u0960": "\U00011607", # letter vocalic rr + "\u0961": "\U00011609", # letter vocalic ll + "\u0962": "\U00011637", # sign vocalic l + "\u0963": "\U00011638", # sign vocalic ll + DANDA: "\U00011641", + DOUBLE_DANDA: "\U00011642", + }, + "MON": { + DANDA: "\u104a", + DOUBLE_DANDA: "\u104b", + }, + "NEWA": { + DANDA: "\U0001144b", + DOUBLE_DANDA: "\U0001144c", + }, + "TAKRI": { + "ख": "𑚸", + }, + "TAMIL_SUPERSCRIPTED": + # Use roman digits per Aksharamukha + { + "०": "0", + "१": "1", + "२": "2", + "३": "3", + "४": "4", + "५": "5", + "६": "6", + "७": "7", + "८": "8", + "९": "9", + }, + "TIBETAN": { + # Virama + "\u094d": "\u0f84", + # Use distinct "va" character instead of "ba". + "व": "\u0f5d", + }, + "VELTHUIS": + # These are part of the Velthuis spec but are errors in indic-transliteration. 
+ { + "ॠ": ".R", + "ॡ": ".L", + # Should be .o, per spec + "ॐ": ".o", + }, + "WX": { + "ऎ": "eV", + "ऒ": "oV", + "ॡ": "LV", + "ळ": "lY", + "ऽ": "Z", + }, + "ZANBAZAR_SQUARE": { + "\u0943": "\U00011A34\U00011A2B\U00011A09", # sign vocalic r + "\u0944": "\U00011A34\U00011A2B\U00011A09\U00011A0A", # sign vocalic rr + "\u0962": "\U00011A34\U00011A2C\U00011A09", # sign vocalic l + "\u0963": "\U00011A34\U00011A2C\U00011A09\U00011A0A", # sign vocalic ll + }, +} + + +EXTENSIONS = { + "BARAHA": [ + ("\u0914", "ou"), + ("\u094c", "ou"), + ("\u0939", "~h"), + # Corrected accent marks: + # - Horizontal line above () + ("\u1cd2", "Q"), + # - Dot below + ("\u1cdd", "V"), + # - TODO: Dot above (can't find the right Unicode for it) + # TODO: ("", "W"), + # - Double vertical line above (double svarita) + ("\u1cda", "$"), + ], + "BRAHMI": [ + (JIHVAMULIYA, "\U00011003"), + (UPADHMANIYA, "\U00011004"), + ], + "DEVANAGARI": [ + # DEVANAGARI VOWEL SIGN PRISHTHAMATRA E (U+094E) + # See comments on U+094E for details. 
+ ("\u0948", "\u0947\u094e"), + ("\u094b", "\u093e\u094e"), + ("\u094c", "\u094b\u094e"), + # Vedic accents + ("\u1cd2", "\u1cd2"), + ("\u1cda", "\u1cda"), + ("\u1cdd", "\u1cdd"), + ("\ua8e0", "\ua8e0"), + # Punctuation + ("\u0970", "\u0970"), + ("\u0971", "\u0971"), + ], + "GONDI_GUNJALA": [ + ("\u094d", "\U00011D97"), + ], + "GONDI_MASARAM": [ + ("त्र", "𑴰"), + ], + "GRANTHA": [ + # OO (EE + AA length mark) + ("\u094b", "\U00011347\U0001133e"), + # AU length mark + ("\u094c", "\U00011357"), + # AU (AA + AU length mark) + ("\u094c", "\U00011347\U00011357"), + # Vedic accents + ("\ua8e0", "\U00011366"), + ("\ua8e1", "\U00011367"), + ("\ua8e2", "\U00011368"), + ("\ua8e3", "\U00011369"), + ("\ua8e4", "\U0001136a"), + ("\ua8e5", "\U0001136b"), + ("\ua8e6", "\U0001136c"), + # -- 3 reserved chars -- + ("\ua8ea", "\U00011370"), # a + ("\ua8ec", "\U00011371"), # ka + ("\ua8ed", "\U00011372"), # na + ("\ua8ee", "\U00011374"), # pa + ("\ua8f0", "\U00011373"), # vi + ], + "ITRANS": [ + # Vedic anusvara (just render as candrabindu) + ("\u0901", "{\\m+}"), + ("\u1cda", "\\\""), + ], + "ISO": [ + # Aytam + ("\u0b83", "ḳ"), + (JIHVAMULIYA, "ẖ"), + (UPADHMANIYA, "ḫ"), + ], + "KANNADA": [ + (JIHVAMULIYA, "\u0cf1"), + (UPADHMANIYA, "\u0cf2"), + ], + "MALAYALAM": [ + # AU archaic mark + ("\u094c", "\u0d4c"), + ], + "NEWA": [ + (JIHVAMULIYA, "\U00011460"), + (UPADHMANIYA, "\U00011461"), + ], + "SHARADA": [ + (JIHVAMULIYA, "\U000111c2"), + (UPADHMANIYA, "\U000111c3"), + ], + "SINHALA": + # Sinhala chandrabindu is not supported in the fonts I tried, so + # use anusvara instead. 
+ [("\u0901", "\u0d82")], + "SLP1": [ + (JIHVAMULIYA, "Z"), + (UPADHMANIYA, "V"), + # Lha + ("ळ्ह", "|"), + # Svarita + ("\u0951", "^"), + # Anudatta + ("\u0952", "\\"), + ], + "SOYOMBO": [ + (JIHVAMULIYA, "\U00011a84"), + (UPADHMANIYA, "\U00011a85"), + (DANDA, "\U00011a9b"), + (DOUBLE_DANDA, "\U00011a9c"), + ], + "TAMIL_SUPERSCRIPTED": [ + # Aytam + ("\u0b83", "\u0b83"), + ], + "TIBETAN": [ + (JIHVAMULIYA, "\u0f88"), + (UPADHMANIYA, "\u0f89"), + ], + "VELTHUIS": [ + # Virama + ("\u094d", "&"), + # Chandrabindu variant + ("\u0901", "/"), + ("\u0945", "~a"), + ("\u0949", "~o"), + # Punctuation + ("\u0970", "@"), + ("\u0971", "#"), + # Consonants with nuqtas + ("\u0931", "^r"), + ("\u0915\u093c", "q"), + ("\u0916\u093c", ".kh"), + ("\u0917\u093c", ".g"), + ("\u091c\u093c", "z"), + ("\u0921\u093c", "R"), + ("\u0922\u093c", "Rh"), + ("\u092b\u093c", "f"), + ], +} + + def _sanitize(s: str) -> str: return s.replace("\\", "\\\\").replace('"', '\\"') @@ -228,6 +571,7 @@ def _to_deva_nfd(s: str) -> str: } return overrides.get(s, s) + def to_unique(xs: list) -> list: seen = set() ret = [] @@ -239,167 +583,7 @@ def to_unique(xs: list) -> list: def _maybe_override(name: str, deva: str, raw: str) -> str | None: - overrides = {} - - if name == "BARAHA": - # Existing accent marks seem to be mostly wrong -- delete so that we - # can redefine them elsewhere. - overrides = { - "\u1ce1": None, - "\ua8e1": None, - "\ua8e2": None, - "\ua8e3": None, - } - elif name == "GRANTHA": - overrides = { - # vowel sign AU - "\u094c": "\U0001134c", - } - elif name == "GONDI_GUNJALA": - overrides = { - # No avagraha defined -- for now, use Devanagari avagraha as placeholder. - "\u093d": "\u093d", - - } - elif name == "GONDI_MASARAM": - overrides = { - # Virama - "\u094d": "\U00011d45", - # No avagraha defined -- for now, use Devanagari avagraha as placeholder. 
- "\u093d": "\u093d", - # Conjuncts - "क्ष": "𑴮", - "ज्ञ": "𑴯", - - } - elif name == "GURMUKHI": - overrides = { - "\u090b": "ਰੁ", # letter vocalic r - "\u0960": "ਰੂ", # letter vocalic rr - "\u090c": "ਲੁ", # letter vocalic l - "\u0961": "ਲੂ", # letter vocalic ll - "\u0943": "\u0a4dਰੁ", # sign vocalic r - "\u0944": "\u0a4dਰੂ", # sign vocalic rr - "\u0962": "\u0a4dਲੁ", # sign vocalic l - "\u0963": "\u0a4dਲੂ", # sign vocalic ll - "\u090e": "ਏ", # letter short e - "\u0912": "ਓ", # letter short o - "\u0946": "\u0a47", # sign short e - "\u094a": "\u0a4b", # sign short o - } - elif name == "HK": - if raw == "|": - return "." - if raw == "||": - return ".." - elif name == "ISO": - overrides = { - "।": ".", - "॥": "..", - "ख़": "k͟h", - # Delete -- common_maps maps this to "ḳ", which we need for aytam. - # We'll add a valid mapping for क़: further below. - "क़": None, - } - elif name == "IAST": - overrides = { - "ळ": "ḻ", - "ऴ": None, - "।": ".", - "॥": "..", - # candrabindu - "\u0901": "m̐", - } - elif name == "KHMER": - overrides = { - "।": "។", - "॥": "៕", - } - elif name == "KHUDAWADI": - overrides = { - "\u090e": "\U000112b6", # letter short e - "\u0912": "\U000112b8", # letter short o - "\u0946": "\U000112e5", # sign short e - "\u094a": "\U000112e7", # sign short o - } - elif name == "MODI": - overrides = { - "\u0907": "\U00011602", # letter i - "\u0908": "\U00011603", # letter ii - "\u0909": "\U00011604", # letter u - "\u090a": "\U00011605", # letter uu - "\u090b": "\U00011606", # letter vocalic r - "\u090c": "\U00011608", # letter vocalic l - "\u093f": "\U00011631", # sign i - "\u0940": "\U00011632", # sign ii - "\u0941": "\U00011633", # sign u - "\u0942": "\U00011634", # sign uu - "\u0943": "\U00011635", # sign vocalic r - "\u0944": "\U00011636", # sign vocalic rr - "\u0960": "\U00011607", # letter vocalic rr - "\u0961": "\U00011609", # letter vocalic ll - "\u0962": "\U00011637", # sign vocalic l - "\u0963": "\U00011638", # sign vocalic ll - - "\u0964": 
"\U00011641", # danda - "\u0965": "\U00011642", # double danda - } - - elif name == "NEWA": - overrides = { - "\u0964": "\U0001144b", # danda - "\u0965": "\U0001144c", # double danda - } - elif name == "TAKRI": - overrides = { - "ख": "𑚸", - } - elif name == "TAMIL_SUPERSCRIPTED": - # Use roman digits per Aksharamukha - overrides = { - "०": "0", - "१": "1", - "२": "2", - "३": "3", - "४": "4", - "५": "5", - "६": "6", - "७": "7", - "८": "8", - "९": "9", - } - elif name == "TIBETAN": - overrides = { - # Virama - "\u094d": "\u0f84", - # Use distinct "va" character instead of "ba". - "व": "\u0f5d", - } - elif name == "VELTHUIS": - # These are part of the Velthuis spec but are errors in indic-transliteration. - overrides = { - "ॠ": ".R", - "ॡ": ".L", - # Should be .o, per spec - "ॐ": ".o", - } - elif name == "WX": - overrides = { - "ऎ": "eV", - "ऒ": "oV", - "ॡ": "LV", - "ळ": "lY", - "ऽ": "Z", - } - elif name == "ZANBAZAR_SQUARE": - overrides = { - "\u0943": "\U00011A34\U00011A2B\U00011A09", # sign vocalic r - "\u0944": "\U00011A34\U00011A2B\U00011A09\U00011A0A", # sign vocalic rr - "\u0962": "\U00011A34\U00011A2C\U00011A09", # sign vocalic l - "\u0963": "\U00011A34\U00011A2C\U00011A09\U00011A0A", # sign vocalic ll - } - - return overrides.get(deva, raw) + return OVERRIDES.get(name, {}).get(deva, raw) def _reorder_short_vowels(items: list) -> list: @@ -407,13 +591,20 @@ def _reorder_short_vowels(items: list) -> list: short_vowel_codes = {"\u0946", "\u094a", "\u090e", "\u0912"} no_short_e_o = [x for x in items if x[0] not in short_vowel_codes] short_e_o = [x for x in items if x[0] in short_vowel_codes] - return no_short_e_o + short_e_o + +def _reorder_kharoshthi_n(items: list) -> list: + nga = {"\u0919"} + no_nga = [x for x in items if x[0] not in nga] + nga = [x for x in items if x[0] in nga] + return no_nga + nga + + def _ol_chiki_consonants(items: list) -> list: new = [] for x, y in items: - if len(y) > 1 and y.endswith('ᱚ'): + if len(y) > 1 and y.endswith("ᱚ"): 
new.append((x, y[:-1])) else: new.append((x, y)) @@ -425,7 +616,7 @@ def create_key_consts() -> str: buf = [] for deva, name in KEY_NAMES.items(): buf.append(f'const {name}: &str = "{deva}";') - return '\n'.join(buf) + return "\n".join(buf) def create_scheme_entry(name: str, items: list[tuple[str, str]]) -> str: @@ -434,8 +625,8 @@ def create_scheme_entry(name: str, items: list[tuple[str, str]]) -> str: buf.append(f"pub const {name}: &[(&str, &str)] = &[") for deva, raw in items: - deva = unicodedata.normalize('NFC', _sanitize(deva)) - raw = unicodedata.normalize('NFC', _sanitize(raw)) + deva = unicodedata.normalize("NFC", _sanitize(deva)) + raw = unicodedata.normalize("NFC", _sanitize(raw)) if (deva, raw) in seen: continue @@ -473,8 +664,16 @@ def main(): "", ] - BRAHMIC_WITH_DEVA_ACCENTS = {"BENGALI", "KANNADA", "TELUGU", "MALAYALAM", "ORIYA", "SHARADA"} - + BRAHMIC_WITH_DEVA_ACCENTS = { + "ASSAMESE", + "BENGALI", + "GUJARATI", + "KANNADA", + "MALAYALAM", + "ORIYA", + "TELUGU", + "SHARADA", + } for path in sorted(glob("/Users/arun/temp/common_maps/**/*.toml")): with open(path, "rb") as f: @@ -509,7 +708,7 @@ def main(): if deva is None: continue - deva = unicodedata.normalize('NFC', _sanitize(deva)) + deva = unicodedata.normalize("NFC", _sanitize(deva)) for alt in alts: alt = _maybe_override(scheme_name, deva, alt) if alt is not None: @@ -541,145 +740,52 @@ def main(): assert isinstance(raw, str) scheme_items.append((mark, raw)) - scheme_items = [(_to_deva_nfd(x), _to_deva_nfd(y)) - for (x, y) in scheme_items] + scheme_items = [(_to_deva_nfd(x), _to_deva_nfd(y)) for (x, y) in scheme_items] scheme_items = to_unique(scheme_items) # Add svarita and anudatta for Brahmic scripts that use Devanagari accent marks. 
if scheme_name in BRAHMIC_WITH_DEVA_ACCENTS: - scheme_items.extend([ - # Svarita - ("\u0951", "\u0951"), - # Anudatta - ("\u0952", "\u0952"), - # Dirgha svarita - ("\u1cda", "\u1cda"), - ]) - elif scheme_name == "GRANTHA": - scheme_items.extend([ - # Svarita (use chandra symbol) - ("\u0951", "\u1cf4"), - # Dirgha svarita (use Devanagari svarita) - ("\u1cda", "\u0951"), - # Anudatta (use Devanagari) - ("\u0952", "\u0952"), - ]) - - if scheme_name == "BARAHA": - scheme_items.extend([ - ("\u0914", "ou"), - ("\u094c", "ou"), - ("\u0939", "~h"), - # Corrected accent marks: - # - Horizontal line above () - ("\u1cd2", "Q"), - # - Dot below - ("\u1cdd", "V"), - # - TODO: Dot above (can't find the right Unicode for it) - # TODO: ("", "W"), - # - Double vertical line above (double svarita) - ("\u1cda", "$"), - ]) - elif scheme_name == "DEVANAGARI": - scheme_items.extend([ - # DEVANAGARI VOWEL SIGN PRISHTHAMATRA E (U+094E) - # See comments on U+094E for details. - ("\u0948", "\u0947\u094e"), - ("\u094b", "\u093e\u094e"), - ("\u094c", "\u094b\u094e"), - - # Vedic accents - ("\u1cd2", "\u1cd2"), - ("\u1cda", "\u1cda"), - ("\u1cdd", "\u1cdd"), - # Punctuation - ("\u0970", "\u0970"), - ("\u0971", "\u0971"), - ]) - elif scheme_name == "GONDI_GUNJALA": - scheme_items.extend([ - ("\u094d", "\U00011D97"), - ]) - elif scheme_name == "GONDI_MASARAM": - scheme_items.extend([ - ("त्र", "𑴰"), - ]) + scheme_items.extend( + [ + # Svarita + ("\u0951", "\u0951"), + # Anudatta + ("\u0952", "\u0952"), + # Dirgha svarita + ("\u1cda", "\u1cda"), + ] + ) elif scheme_name == "GRANTHA": - scheme_items.extend([ - # OO (EE + AA length mark) - ("\u094b", "\U00011347\U0001133e"), - # AU length mark - ("\u094c", "\U00011357"), - # AU (AA + AU length mark) - ("\u094c", "\U00011347\U00011357"), - ]) - elif scheme_name == "ITRANS": - scheme_items.extend([ - # Vedic anusvara (just render as candrabindu) - ("\u0901", "{\\m+}"), - ]) - elif scheme_name == "ISO": - scheme_items.extend([ - # Aytam - ("\u0b83", 
"ḳ"), - ]) - elif scheme_name == "SINHALA": - # Sinhala chandrabindu is not supported in the fonts I tried, so - # use anusvara instead. - scheme_items.append(("\u0901", "\u0d82")) - elif scheme_name == "SLP1": - scheme_items.extend([ - # Jihvamuliya - ("\u1cf5", "Z"), - # Upadhmaniya - ("\u1cf6", "V"), - # Lha - ("ळ्ह", "|"), - # Svarita - ("\u0951", "^"), - # Anudatta - ("\u0952", "\\"), - ]) - elif scheme_name == "TAMIL_SUPERSCRIPTED": - scheme_items.extend([ - # Aytam - ("\u0b83", "\u0b83"), - ]) - elif scheme_name == "VELTHUIS": - scheme_items.extend([ - # Virama - ("\u094d", "&"), - # Chandrabindu variant - ("\u0901", "/"), - ("\u0945", "~a"), - ("\u0949", "~o"), - # Punctuation - ("\u0970", "@"), - ("\u0971", "#"), - # Consonants with nuqtas - ("\u0931", "^r"), - ("\u0915\u093c", "q"), - ("\u0916\u093c", ".kh"), - ("\u0957\u093c", ".g"), - ("\u091c\u093c", "z"), - ("\u0921\u093c", "R"), - ("\u0922\u093c", "Rh"), - ("\u092b\u093c", "f"), - ]) + scheme_items.extend( + [ + # Svarita (use chandra symbol) + ("\u0951", "\u1cf4"), + # Dirgha svarita (use Devanagari svarita) + ("\u1cda", "\u0951"), + # Anudatta (use Devanagari) + ("\u0952", "\u0952"), + ] + ) + + scheme_items.extend(EXTENSIONS.get(scheme_name, [])) renames = { "GONDI_GUNJALA": "GUNJALA_GONDI", "GONDI_MASARAM": "MASARAM_GONDI", + "ISO": "ISO_15919", "MANIPURI": "MEETEI_MAYEK", "ZANBAZAR_SQUARE": "ZANABAZAR_SQUARE", "TAMIL_SUPERSCRIPTED": "TAMIL", "TIRHUTA_MAITHILI": "TIRHUTA", } + scheme_name = renames.get(scheme_name, scheme_name) scheme_items = _reorder_short_vowels(scheme_items) if scheme_name == "OL_CHIKI": scheme_items = _ol_chiki_consonants(scheme_items) + elif scheme_name == "KHAROSHTHI": + scheme_items = _reorder_kharoshthi_n(scheme_items) buf.append(create_scheme_entry(scheme_name, scheme_items)) with open(CRATE_DIR / "src/autogen_schemes.rs", "w") as f: diff --git a/vidyut-lipi/src/autogen_schemes.rs b/vidyut-lipi/src/autogen_schemes.rs index 7bd0331..f90c6ae 100644 --- 
a/vidyut-lipi/src/autogen_schemes.rs +++ b/vidyut-lipi/src/autogen_schemes.rs @@ -112,6 +112,32 @@ const DANDA: &str = "।"; const DOUBLE_DANDA: &str = "॥"; const ABBREVIATION_SIGN: &str = "॰"; const HIGH_SPACING_DOT: &str = "ॱ"; +const PRENKHA: &str = "᳒"; +const VEDIC_DOT_BELOW: &str = "᳝"; +const ATHARVAVEDA_INDEPENDENT_SVARITA: &str = "᳡"; +const JIHVAMULIYA: &str = "ᳵ"; +const UPADHMANIYA: &str = "ᳶ"; +const VEDIC_RING_ABOVE: &str = "᳸"; +const COMBINING_DIGIT_0: &str = "꣠"; +const COMBINING_DIGIT_1: &str = "꣡"; +const COMBINING_DIGIT_2: &str = "꣢"; +const COMBINING_DIGIT_3: &str = "꣣"; +const COMBINING_DIGIT_4: &str = "꣤"; +const COMBINING_DIGIT_5: &str = "꣥"; +const COMBINING_DIGIT_6: &str = "꣦"; +const COMBINING_DIGIT_7: &str = "꣧"; +const COMBINING_DIGIT_8: &str = "꣨"; +const COMBINING_DIGIT_9: &str = "꣩"; +const COMBINING_A: &str = "꣪"; +const COMBINING_U: &str = "꣫"; +const COMBINING_KA: &str = "꣬"; +const COMBINING_NA: &str = "꣭"; +const COMBINING_PA: &str = "꣮"; +const COMBINING_RA: &str = "꣯"; +const COMBINING_VI: &str = "꣰"; +const COMBINING_AVAGRAHA: &str = "꣱"; +const CANDRABINDU_VIRAMA: &str = "ꣳ"; +const TAMIL_AYTHAM: &str = "ஃ"; const ZERO_WIDTH_NON_JOINER: &str = "‌"; const ZERO_WIDTH_JOINER: &str = "‍"; @@ -217,6 +243,110 @@ pub const AHOM: &[(&str, &str)] = &[ (SIGN_O, "𑜨"), ]; +pub const ASSAMESE: &[(&str, &str)] = &[ + (A, "অ"), + (AA, "আ"), + (I, "ই"), + (II, "ঈ"), + (U, "উ"), + (UU, "ঊ"), + (R, "ঋ"), + (RR, "ৠ"), + (L, "ঌ"), + (LL, "ৡ"), + (EE, "এ"), + (AI, "ঐ"), + (OO, "ও"), + (AU, "ঔ"), + (SIGN_AA, "া"), + (SIGN_I, "ি"), + (SIGN_II, "ী"), + (SIGN_U, "ু"), + (SIGN_UU, "ূ"), + (SIGN_R, "ৃ"), + (SIGN_RR, "ৄ"), + (SIGN_L, "ৢ"), + (SIGN_LL, "ৣ"), + (SIGN_EE, "ে"), + (SIGN_AI, "ৈ"), + (SIGN_OO, "ো"), + (SIGN_AU, "ৌ"), + (ANUSVARA, "ং"), + (VISARGA, "ঃ"), + (CANDRABINDU, "ঁ"), + (VIRAMA, "্"), + (KA, "ক"), + (KHA, "খ"), + (GA, "গ"), + (GHA, "ঘ"), + (NGA, "ঙ"), + (CA, "চ"), + (CHA, "ছ"), + (JA, "জ"), + (JHA, "ঝ"), + (NYA, "ঞ"), + (TTA, "ট"), + 
(TTHA, "ঠ"), + (DDA, "ড"), + (DDHA, "ঢ"), + (NNA, "ণ"), + (TA, "ত"), + (THA, "থ"), + (DA, "দ"), + (DHA, "ধ"), + (NA, "ন"), + (PA, "প"), + (PHA, "ফ"), + (BA, "ব"), + (BHA, "ভ"), + (MA, "ম"), + (YA, "য"), + (RA, "ৰ"), + (LA, "ল"), + (VA, "ৱ"), + (SHA, "শ"), + (SSA, "ষ"), + (SA, "স"), + (HA, "হ"), + (LLA, "ল়"), + (KSSA, "ক্ষ"), + (JNYA, "জ্ঞ"), + (DIGIT_0, "০"), + (DIGIT_1, "১"), + (DIGIT_2, "২"), + (DIGIT_3, "৩"), + (DIGIT_4, "৪"), + (DIGIT_5, "৫"), + (DIGIT_6, "৬"), + (DIGIT_7, "৭"), + (DIGIT_8, "৮"), + (DIGIT_9, "৯"), + (OM, "ওঁ"), + (AVAGRAHA, "ঽ"), + (DANDA, "।"), + (DOUBLE_DANDA, "॥"), + (ZERO_WIDTH_JOINER, "‍"), + ("", ""), + (SIGN_CANDRA_E, "ে"), + (QA, "ক়"), + (KHHA, "খ়"), + (GHHA, "গ়"), + (ZA, "জ়"), + (DDDHA, "ড়"), + (RHA, "ঢ়"), + (FA, "ফ়"), + (YYA, "য়"), + (RRA, "ৰ়"), + (LLLA, "ষ়"), + (SVARITA, "॑"), + (ANUDATTA, "॒"), + (DOUBLE_SVARITA, "᳚"), + (E, "ऎ"), + (O, "ऒ"), + (SIGN_E, "ॆ"), + (SIGN_O, "ॊ"), +]; + pub const BALINESE: &[(&str, &str)] = &[ (A, "ᬅ"), (AA, "ᬆ"), @@ -621,6 +751,8 @@ pub const BRAHMI: &[(&str, &str)] = &[ (FA, "𑀨"), (YYA, "𑀬"), (RRA, "𑀭"), + (JIHVAMULIYA, "𑀃"), + (UPADHMANIYA, "𑀄"), (E, "𑀏𑁆"), (O, "𑀑𑁆"), (SIGN_E, "𑁂"), @@ -866,9 +998,9 @@ pub const DEVANAGARI: &[(&str, &str)] = &[ (ANUSVARA, "ं"), (VISARGA, "ः"), (CANDRABINDU, "ँ"), - ("ᳵ", "ᳵ"), - ("ᳶ", "ᳶ"), - ("ꣳ", "ꣳ"), + (JIHVAMULIYA, "ᳵ"), + (UPADHMANIYA, "ᳶ"), + (CANDRABINDU_VIRAMA, "ꣳ"), (VIRAMA, "्"), (KA, "क"), (KHA, "ख"), @@ -926,27 +1058,27 @@ pub const DEVANAGARI: &[(&str, &str)] = &[ (SVARITA, "॑"), (DOUBLE_SVARITA, "᳚"), (ANUDATTA, "॒"), - ("᳒", "᳒"), - ("᳡", "᳡"), - ("꣡", "꣡"), - ("꣢", "꣢"), - ("꣣", "꣣"), - ("꣤", "꣤"), - ("꣥", "꣥"), - ("꣦", "꣦"), - ("꣧", "꣧"), - ("꣨", "꣨"), - ("꣩", "꣩"), - ("꣪", "꣪"), - ("꣫", "꣫"), - ("꣬", "꣬"), - ("꣭", "꣭"), - ("꣮", "꣮"), - ("꣯", "꣯"), - ("꣰", "꣰"), - ("꣱", "꣱"), - ("᳝", "᳝"), - ("᳸", "᳸"), + (PRENKHA, "᳒"), + (ATHARVAVEDA_INDEPENDENT_SVARITA, "᳡"), + (COMBINING_DIGIT_1, "꣡"), + (COMBINING_DIGIT_2, "꣢"), + (COMBINING_DIGIT_3, 
"꣣"), + (COMBINING_DIGIT_4, "꣤"), + (COMBINING_DIGIT_5, "꣥"), + (COMBINING_DIGIT_6, "꣦"), + (COMBINING_DIGIT_7, "꣧"), + (COMBINING_DIGIT_8, "꣨"), + (COMBINING_DIGIT_9, "꣩"), + (COMBINING_A, "꣪"), + (COMBINING_U, "꣫"), + (COMBINING_KA, "꣬"), + (COMBINING_NA, "꣭"), + (COMBINING_PA, "꣮"), + (COMBINING_RA, "꣯"), + (COMBINING_VI, "꣰"), + (COMBINING_AVAGRAHA, "꣱"), + (VEDIC_DOT_BELOW, "᳝"), + (VEDIC_RING_ABOVE, "᳸"), (QA, "क़"), (KHHA, "ख़"), (GHHA, "ग़"), @@ -961,6 +1093,7 @@ pub const DEVANAGARI: &[(&str, &str)] = &[ (SIGN_AI, "ेॎ"), (SIGN_OO, "ाॎ"), (SIGN_AU, "ोॎ"), + (COMBINING_DIGIT_0, "꣠"), (ABBREVIATION_SIGN, "॰"), (HIGH_SPACING_DOT, "ॱ"), (E, "ऎ"), @@ -1373,6 +1506,18 @@ pub const GRANTHA: &[(&str, &str)] = &[ (DOUBLE_SVARITA, "॑"), (ANUDATTA, "॒"), (SIGN_AU, "𑍗"), + (COMBINING_DIGIT_0, "𑍦"), + (COMBINING_DIGIT_1, "𑍧"), + (COMBINING_DIGIT_2, "𑍨"), + (COMBINING_DIGIT_3, "𑍩"), + (COMBINING_DIGIT_4, "𑍪"), + (COMBINING_DIGIT_5, "𑍫"), + (COMBINING_DIGIT_6, "𑍬"), + (COMBINING_A, "𑍰"), + (COMBINING_KA, "𑍱"), + (COMBINING_NA, "𑍲"), + (COMBINING_DIGIT_6, "𑍴"), + (COMBINING_VI, "𑍳"), (E, "𑌏𑌀"), (O, "𑌓𑌀"), (SIGN_E, "𑍇𑌀"), @@ -1463,8 +1608,6 @@ pub const GUJARATI: &[(&str, &str)] = &[ (DOUBLE_DANDA, "॥"), (ZERO_WIDTH_JOINER, ""), ("", ""), - (SVARITA, "॑"), - (ANUDATTA, "॒"), (SIGN_CANDRA_E, "ૅ"), (QA, "ક઼"), (KHHA, "ખ઼"), @@ -1476,6 +1619,9 @@ pub const GUJARATI: &[(&str, &str)] = &[ (YYA, "ય઼"), (RRA, "ર઼"), (LLLA, "ળ઼"), + (SVARITA, "॑"), + (ANUDATTA, "॒"), + (DOUBLE_SVARITA, "᳚"), (E, "ऎ"), (O, "ऒ"), (SIGN_E, "ॆ"), @@ -1664,8 +1810,8 @@ pub const JAVANESE: &[(&str, &str)] = &[ (DIGIT_9, "꧙"), (OM, "ꦎꦀ"), (AVAGRAHA, "'"), - (DANDA, "।"), - (DOUBLE_DANDA, "॥"), + (DANDA, "꧈"), + (DOUBLE_DANDA, "꧉"), (SIGN_CANDRA_E, "ꦼ"), (E, ""), (O, ""), @@ -1804,8 +1950,8 @@ pub const KANNADA: &[(&str, &str)] = &[ (ANUSVARA, "ಂ"), (VISARGA, "ಃ"), (CANDRABINDU, "ಁ"), - ("ᳵ", "ೱ"), - ("ᳶ", "ೲ"), + (JIHVAMULIYA, "ೱ"), + (UPADHMANIYA, "ೲ"), (VIRAMA, "್"), (KA, "ಕ"), (KHA, "ಖ"), @@ -1878,6 
+2024,95 @@ pub const KANNADA: &[(&str, &str)] = &[ (SIGN_O, "ೊ"), ]; +pub const KHAROSHTHI: &[(&str, &str)] = &[ + (A, "𐨀"), + (AA, "𐨀𐨌"), + (I, "𐨀𐨁"), + (II, "𐨀𐨁𐨌"), + (U, "𐨀𐨂"), + (UU, "𐨀𐨂𐨌"), + (R, "𐨀𐨃"), + (RR, "𐨀𐨃𐨌"), + (L, "𐨫𐨂"), + (LL, "𐨫𐨂𐨌"), + (EE, "𐨀𐨅"), + (AI, "𐨀𐨅𐨌"), + (OO, "𐨀𐨆"), + (AU, "𐨀𐨆𐨌"), + (SIGN_AA, "𐨌"), + (SIGN_I, "𐨁"), + (SIGN_II, "𐨁𐨌"), + (SIGN_U, "𐨂"), + (SIGN_UU, "𐨂𐨌"), + (SIGN_R, "𐨃"), + (SIGN_RR, "𐨃𐨌"), + (SIGN_L, "𐨿𐨫𐨂"), + (SIGN_LL, "𐨿𐨫𐨂𐨌"), + (SIGN_EE, "𐨅"), + (SIGN_AI, "𐨅𐨌"), + (SIGN_OO, "𐨆"), + (SIGN_AU, "𐨆𐨌"), + (ANUSVARA, "𐨎"), + (VISARGA, "𐨏"), + (CANDRABINDU, "𐨎"), + (VIRAMA, "𐨿"), + (KA, "𐨐"), + (KHA, "𐨑"), + (GA, "𐨒"), + (GHA, "𐨓"), + (CA, "𐨕"), + (CHA, "𐨖"), + (JA, "𐨗"), + (JHA, "𐨗"), + (NYA, "𐨙"), + (TTA, "𐨚"), + (TTHA, "𐨛"), + (DDA, "𐨜"), + (DDHA, "𐨝"), + (NNA, "𐨞"), + (TA, "𐨟"), + (THA, "𐨠"), + (DA, "𐨡"), + (DHA, "𐨢"), + (NA, "𐨣"), + (PA, "𐨤"), + (PHA, "𐨥"), + (BA, "𐨦"), + (BHA, "𐨧"), + (MA, "𐨨"), + (YA, "𐨩"), + (RA, "𐨪"), + (LA, "𐨫"), + (VA, "𐨬"), + (SHA, "𐨭"), + (SSA, "𐨮"), + (SA, "𐨯"), + (HA, "𐨱"), + (LLA, "𐨫"), + (KSSA, "𐨐𐨿𐨮"), + (JNYA, "𐨗𐨿𐨙"), + (DIGIT_0, ""), + (DIGIT_1, "𐩀"), + (DIGIT_2, "𐩁"), + (DIGIT_3, "𐩂"), + (DIGIT_4, "𐩃"), + (DIGIT_5, "𐩃𐩀"), + (DIGIT_6, "𐩃𐩁"), + (DIGIT_7, "𐩃𐩂"), + (DIGIT_8, "𐩃𐩃"), + (DIGIT_9, "𐩃𐩃𐩀"), + (OM, "𐨀𐨅𐨎"), + (AVAGRAHA, ""), + (DANDA, "𐩖"), + (DOUBLE_DANDA, "𐩗"), + (SIGN_CANDRA_E, "𐨅"), + (E, ""), + (O, ""), + (SIGN_E, ""), + (SIGN_O, ""), + (NGA, "𐨣"), +]; + pub const KHMER: &[(&str, &str)] = &[ (A, "អ"), (AA, "អា"), @@ -2240,11 +2475,11 @@ pub const LIMBU: &[(&str, &str)] = &[ (I, "ᤀᤡ"), (II, "ᤀᤡ᤺"), (U, "ᤀᤢ"), - (UU, "ᤀ᤺ᤢ"), + (UU, "ᤀᤢ᤺"), (R, "ᤖᤢ"), - (RR, "ᤖ᤺ᤢ"), + (RR, "ᤖᤢ᤺"), (L, "ᤗᤢ"), - (LL, "ᤗ᤺ᤢ"), + (LL, "ᤗᤢ᤺"), (EE, "ᤀᤣ"), (AI, "ᤀᤤ"), (OO, "ᤀᤥ"), @@ -2253,11 +2488,11 @@ pub const LIMBU: &[(&str, &str)] = &[ (SIGN_I, "ᤡ"), (SIGN_II, "ᤡ᤺"), (SIGN_U, "ᤢ"), - (SIGN_UU, "᤺ᤢ"), - (SIGN_R, "ᤪᤢ"), - (SIGN_RR, "ᤪ᤺ᤢ"), + (SIGN_UU, "ᤢ᤺"), + (SIGN_R, "᤻ᤖᤢ"), + (SIGN_RR, "᤻ᤖᤢ᤺"), 
(SIGN_L, "᤻ᤗᤢ"), - (SIGN_LL, "᤻ᤗ᤺ᤢ"), + (SIGN_LL, "᤻ᤗᤢ᤺"), (SIGN_EE, "ᤣ"), (SIGN_AI, "ᤤ"), (SIGN_OO, "ᤥ"), @@ -2317,10 +2552,10 @@ pub const LIMBU: &[(&str, &str)] = &[ (DANDA, "।"), (DOUBLE_DANDA, "॥"), (SIGN_CANDRA_E, "ᤧ"), - (E, ""), - (O, ""), - (SIGN_E, ""), - (SIGN_O, ""), + (E, "ᤀᤧ"), + (O, "ᤀᤨ"), + (SIGN_E, "ᤧ"), + (SIGN_O, "ᤨ"), ]; pub const MALAYALAM: &[(&str, &str)] = &[ @@ -2350,7 +2585,7 @@ pub const MALAYALAM: &[(&str, &str)] = &[ (SIGN_EE, "േ"), (SIGN_AI, "ൈ"), (SIGN_OO, "ോ"), - (SIGN_AU, "ൌ"), + (SIGN_AU, "ൗ"), (ANUSVARA, "ം"), (VISARGA, "ഃ"), (CANDRABINDU, "ഁ"), @@ -2418,6 +2653,7 @@ pub const MALAYALAM: &[(&str, &str)] = &[ (SVARITA, "॑"), (ANUDATTA, "॒"), (DOUBLE_SVARITA, "᳚"), + (SIGN_AU, "ൌ"), (E, "എ"), (O, "ഒ"), (SIGN_E, "െ"), @@ -2504,8 +2740,8 @@ pub const MEETEI_MAYEK: &[(&str, &str)] = &[ (DIGIT_9, "꯹"), (OM, "ꯑꯣꯪ"), (AVAGRAHA, "'"), - (DANDA, "।"), - (DOUBLE_DANDA, "॥"), + (DANDA, "꯫"), + (DOUBLE_DANDA, "꯫꯫"), (SIGN_CANDRA_E, "ꯦ"), (E, ""), (O, ""), @@ -2602,6 +2838,95 @@ pub const MODI: &[(&str, &str)] = &[ (SIGN_O, ""), ]; +pub const MON: &[(&str, &str)] = &[ + (A, "အ"), + (AA, "အာ"), + (I, "ဣ"), + (II, "ဣဳ"), + (U, "ဥ"), + (UU, "ဥူ"), + (R, "ၒ"), + (RR, "ၓ"), + (L, "ၔ"), + (LL, "ၕ"), + (EE, "ဨ"), + (AI, "အဲ"), + (OO, "ဩ"), + (AU, "ဪ"), + (SIGN_AA, "ာ"), + (SIGN_I, "ိ"), + (SIGN_II, "ဳ"), + (SIGN_U, "ု"), + (SIGN_UU, "ူ"), + (SIGN_R, "ၖ"), + (SIGN_RR, "ၗ"), + (SIGN_L, "ၘ"), + (SIGN_LL, "ၙ"), + (SIGN_EE, "ေ"), + (SIGN_AI, "ဲ"), + (SIGN_OO, "ော"), + (SIGN_AU, "ော်"), + (ANUSVARA, "ံ"), + (VISARGA, "း"), + (CANDRABINDU, "ံ"), + (VIRAMA, "်"), + (KA, "က"), + (KHA, "ခ"), + (GA, "ဂ"), + (GHA, "ဃ"), + (NGA, "ၚ"), + (CA, "စ"), + (CHA, "ဆ"), + (JA, "ဇ"), + (JHA, "ၛ"), + (NYA, "ည"), + (TTA, "ဋ"), + (TTHA, "ဌ"), + (DDA, "ဍ"), + (DDHA, "ဎ"), + (NNA, "ဏ"), + (TA, "တ"), + (THA, "ထ"), + (DA, "ဒ"), + (DHA, "ဓ"), + (NA, "န"), + (PA, "ပ"), + (PHA, "ဖ"), + (BA, "ဗ"), + (BHA, "ဘ"), + (MA, "မ"), + (YA, "ယ"), + (RA, "ရ"), + (LA, "လ"), + (VA, "ဝ"), + (SHA, 
"ၐ"), + (SSA, "ၑ"), + (SA, "သ"), + (HA, "ဟ"), + (LLA, "ဠ"), + (KSSA, "က္ၑ"), + (JNYA, "ဇ္ည"), + (DIGIT_0, "၀"), + (DIGIT_1, "၁"), + (DIGIT_2, "၂"), + (DIGIT_3, "၃"), + (DIGIT_4, "၄"), + (DIGIT_5, "၅"), + (DIGIT_6, "၆"), + (DIGIT_7, "၇"), + (DIGIT_8, "၈"), + (DIGIT_9, "၉"), + (OM, "ဩံ"), + (AVAGRAHA, "'"), + (DANDA, "၊"), + (DOUBLE_DANDA, "။"), + (SIGN_CANDRA_E, "ေ"), + (E, ""), + (O, ""), + (SIGN_E, ""), + (SIGN_O, ""), +]; + pub const NANDINAGARI: &[(&str, &str)] = &[ (A, "𑦠"), (AA, "𑦡"), @@ -2773,6 +3098,8 @@ pub const NEWA: &[(&str, &str)] = &[ (DANDA, "𑑋"), (DOUBLE_DANDA, "𑑌"), (SIGN_CANDRA_E, "𑐾"), + (JIHVAMULIYA, "𑑠"), + (UPADHMANIYA, "𑑡"), (E, ""), (O, ""), (SIGN_E, ""), @@ -3179,6 +3506,8 @@ pub const SHARADA: &[(&str, &str)] = &[ (SVARITA, "॑"), (ANUDATTA, "॒"), (DOUBLE_SVARITA, "᳚"), + (JIHVAMULIYA, "𑇂"), + (UPADHMANIYA, "𑇃"), (E, "𑆍𑇌"), (O, "𑆏𑇌"), (SIGN_E, "𑆼𑇌"), @@ -3372,6 +3701,98 @@ pub const SINHALA: &[(&str, &str)] = &[ (SIGN_O, "ො"), ]; +pub const SOYOMBO: &[(&str, &str)] = &[ + (A, "𑩐"), + (AA, "𑩐𑩛"), + (I, "𑩐𑩑"), + (II, "𑩐𑩑𑩛"), + (U, "𑩐𑩒"), + (UU, "𑩐𑩒𑩛"), + (R, "𑩐𑩙"), + (RR, "𑩐𑩙𑩛"), + (L, "𑩐𑩚"), + (LL, "𑩐𑩚𑩛"), + (EE, "𑩐𑩔"), + (AI, "𑩐𑩗"), + (OO, "𑩐𑩖"), + (AU, "𑩐𑩘"), + (SIGN_AA, "𑩛"), + (SIGN_I, "𑩑"), + (SIGN_II, "𑩑𑩛"), + (SIGN_U, "𑩒"), + (SIGN_UU, "𑩒𑩛"), + (SIGN_R, "𑩙"), + (SIGN_RR, "𑩙𑩛"), + (SIGN_L, "𑩚"), + (SIGN_LL, "𑩚𑩛"), + (SIGN_EE, "𑩔"), + (SIGN_AI, "𑩗"), + (SIGN_OO, "𑩖"), + (SIGN_AU, "𑩘"), + (ANUSVARA, "𑪖"), + (VISARGA, "𑪗"), + (CANDRABINDU, "𑪖"), + (VIRAMA, "𑪘"), + (KA, "𑩜"), + (KHA, "𑩝"), + (GA, "𑩞"), + (GHA, "𑩟"), + (NGA, "𑩠"), + (CA, "𑩵"), + (CHA, "𑩶"), + (JA, "𑩷"), + (JHA, "𑩤"), + (NYA, "𑩥"), + (TTA, "𑩦"), + (TTHA, "𑩧"), + (DDA, "𑩨"), + (DDHA, "𑩩"), + (NNA, "𑩪"), + (TA, "𑩫"), + (THA, "𑩬"), + (DA, "𑩭"), + (DHA, "𑩮"), + (NA, "𑩯"), + (PA, "𑩰"), + (PHA, "𑩱"), + (BA, "𑩲"), + (BHA, "𑩳"), + (MA, "𑩴"), + (YA, "𑩻"), + (RA, "𑩼"), + (LA, "𑩽"), + (VA, "𑩾"), + (SHA, "𑩿"), + (SSA, "𑪀"), + (SA, "𑪁"), + (HA, "𑪂"), + (LLA, "𑩽"), + (KSSA, "𑪃"), + (JNYA, 
"𑩷𑪙𑩥"), + (DIGIT_0, "0"), + (DIGIT_1, "1"), + (DIGIT_2, "2"), + (DIGIT_3, "3"), + (DIGIT_4, "4"), + (DIGIT_5, "5"), + (DIGIT_6, "6"), + (DIGIT_7, "7"), + (DIGIT_8, "8"), + (DIGIT_9, "9"), + (OM, "𑩐𑩖𑪖"), + (AVAGRAHA, ""), + ("𑪛", "𑪛"), + ("𑪜", "𑪜"), + (JIHVAMULIYA, "𑪄"), + (UPADHMANIYA, "𑪅"), + (DANDA, "𑪛"), + (DOUBLE_DANDA, "𑪜"), + (E, "𑩐𑩔"), + (O, "𑩐𑩖"), + (SIGN_E, "𑩔"), + (SIGN_O, "𑩖"), +]; + pub const TAI_THAM: &[(&str, &str)] = &[ (A, "ᩋ"), (AA, "ᩋᩣ"), @@ -3652,7 +4073,7 @@ pub const TAMIL: &[(&str, &str)] = &[ (RRA, "ற"), (LLLA, "ழ"), (NNNA, "ன"), - ("ஃ", "ஃ"), + (TAMIL_AYTHAM, "ஃ"), (E, "எ"), (O, "ஒ"), (SIGN_E, "ெ"), @@ -3951,6 +4372,8 @@ pub const TIBETAN: &[(&str, &str)] = &[ (FA, "ཕ༹"), (YYA, "ཡ༹"), (RRA, "ར༹"), + (JIHVAMULIYA, "ྈ"), + (UPADHMANIYA, "ྉ"), (E, "ཨེ"), (O, "ཨོ"), (SIGN_E, "ེ"), @@ -4186,9 +4609,9 @@ pub const BARAHA: &[(&str, &str)] = &[ (ANUSVARA, "M"), (VISARGA, "H"), (CANDRABINDU, "~M"), - ("ᳵ", ""), - ("ᳶ", ""), - ("ꣳ", "(gg)"), + (JIHVAMULIYA, ""), + (UPADHMANIYA, ""), + (CANDRABINDU_VIRAMA, "(gg)"), (VIRAMA, ""), (KA, "k"), (KHA, "kh"), @@ -4257,9 +4680,9 @@ pub const BARAHA: &[(&str, &str)] = &[ ("", "_"), (SVARITA, "#"), (ANUDATTA, "q"), - ("᳝", "V"), - ("᳸", "W"), - ("᳒", "Q"), + (VEDIC_DOT_BELOW, "V"), + (VEDIC_RING_ABOVE, "W"), + (PRENKHA, "Q"), (DOUBLE_SVARITA, "$"), (AA, "aa"), (SIGN_AA, "aa"), @@ -4420,28 +4843,28 @@ pub const IAST: &[(&str, &str)] = &[ (ANUSVARA, "ṃ"), (VISARGA, "ḥ"), (CANDRABINDU, "m̐"), - ("ꣳ", "m̐"), + (CANDRABINDU_VIRAMA, "m̐"), (VIRAMA, ""), (SVARITA, "̭"), (ANUDATTA, "॒"), - ("᳡", "̀"), - ("꣡", "́"), - ("꣢", "²"), - ("꣣", "³"), - ("꣤", "⁴"), - ("꣥", "⁵"), - ("꣦", "⁶"), - ("꣧", "⁷"), - ("꣨", "⁸"), - ("꣩", "⁹"), - ("꣪", "꣪"), - ("꣫", "꣫"), - ("꣬", "꣬"), - ("꣭", "꣭"), - ("꣮", "꣮"), - ("꣯", "꣯"), - ("꣰", "꣰"), - ("꣱", "꣱"), + (ATHARVAVEDA_INDEPENDENT_SVARITA, "̀"), + (COMBINING_DIGIT_1, "́"), + (COMBINING_DIGIT_2, "²"), + (COMBINING_DIGIT_3, "³"), + (COMBINING_DIGIT_4, "⁴"), + (COMBINING_DIGIT_5, "⁵"), + 
(COMBINING_DIGIT_6, "⁶"), + (COMBINING_DIGIT_7, "⁷"), + (COMBINING_DIGIT_8, "⁸"), + (COMBINING_DIGIT_9, "⁹"), + (COMBINING_A, "꣪"), + (COMBINING_U, "꣫"), + (COMBINING_KA, "꣬"), + (COMBINING_NA, "꣭"), + (COMBINING_PA, "꣮"), + (COMBINING_RA, "꣯"), + (COMBINING_VI, "꣰"), + (COMBINING_AVAGRAHA, "꣱"), (KA, "k"), (KHA, "kh"), (GA, "g"), @@ -4504,19 +4927,19 @@ pub const IAST: &[(&str, &str)] = &[ (NNNA, "ṉ"), (AVAGRAHA, "`"), (AVAGRAHA, "’"), - ("ꣳ", "ṁ"), + (CANDRABINDU_VIRAMA, "ṁ"), (R, "r̥"), (SIGN_R, "r̥"), (RR, "r̥̄"), (SIGN_RR, "r̥̄"), - ("꣡", "¹"), + (COMBINING_DIGIT_1, "¹"), (E, "è"), (O, "ò"), (SIGN_E, "è"), (SIGN_O, "ò"), ]; -pub const ISO: &[(&str, &str)] = &[ +pub const ISO_15919: &[(&str, &str)] = &[ (A, "a"), (AA, "ā"), (I, "i"), @@ -4548,31 +4971,30 @@ pub const ISO: &[(&str, &str)] = &[ (SIGN_AU, "au"), (ANUSVARA, "ṁ"), (VISARGA, "ḥ"), - ("ᳵ", "ẖ"), - ("ḫ", "ᳶ"), + (JIHVAMULIYA, "ẖ"), (CANDRABINDU, "m̐"), (VIRAMA, ""), (ZERO_WIDTH_JOINER, "{}"), (SVARITA, "̭"), (ANUDATTA, "॒"), - ("᳡", "̀"), - ("꣡", "́"), - ("꣢", "²"), - ("꣣", "³"), - ("꣤", "⁴"), - ("꣥", "⁵"), - ("꣦", "⁶"), - ("꣧", "⁷"), - ("꣨", "⁸"), - ("꣩", "⁹"), - ("꣪", "꣪"), - ("꣫", "꣫"), - ("꣬", "꣬"), - ("꣭", "꣭"), - ("꣮", "꣮"), - ("꣯", "꣯"), - ("꣰", "꣰"), - ("꣱", "꣱"), + (ATHARVAVEDA_INDEPENDENT_SVARITA, "̀"), + (COMBINING_DIGIT_1, "́"), + (COMBINING_DIGIT_2, "²"), + (COMBINING_DIGIT_3, "³"), + (COMBINING_DIGIT_4, "⁴"), + (COMBINING_DIGIT_5, "⁵"), + (COMBINING_DIGIT_6, "⁶"), + (COMBINING_DIGIT_7, "⁷"), + (COMBINING_DIGIT_8, "⁸"), + (COMBINING_DIGIT_9, "⁹"), + (COMBINING_A, "꣪"), + (COMBINING_U, "꣫"), + (COMBINING_KA, "꣬"), + (COMBINING_NA, "꣭"), + (COMBINING_PA, "꣮"), + (COMBINING_RA, "꣯"), + (COMBINING_VI, "꣰"), + (COMBINING_AVAGRAHA, "꣱"), (KA, "k"), (KHA, "kh"), (GA, "g"), @@ -4650,10 +5072,11 @@ pub const ISO: &[(&str, &str)] = &[ (CANDRABINDU, "ṁ"), ("व़", "ẉ"), (GHHA, "g̠ẖ"), - ("꣡", "¹"), + (COMBINING_DIGIT_1, "¹"), (R, "r̤i"), (SIGN_R, "r̤i"), - ("ஃ", "ḳ"), + (TAMIL_AYTHAM, "ḳ"), + 
(UPADHMANIYA, "ḫ"), (E, "e"), (O, "o"), (SIGN_E, "e"), @@ -4796,6 +5219,7 @@ pub const ITRANS: &[(&str, &str)] = &[ (DOUBLE_DANDA, ".."), (ZA, "J"), (CANDRABINDU, "{\\m+}"), + (DOUBLE_SVARITA, "\\\""), (E, "è"), (O, "ò"), (SIGN_E, "è"), @@ -4895,8 +5319,8 @@ pub const SLP1: &[(&str, &str)] = &[ (RRA, "r2"), (LLLA, "L0"), (NNNA, "n0"), - ("ᳵ", "Z"), - ("ᳶ", "V"), + (JIHVAMULIYA, "Z"), + (UPADHMANIYA, "V"), ("ळ्ह", "|"), (SVARITA, "^"), (ANUDATTA, "\\"), diff --git a/vidyut-lipi/src/detect.rs b/vidyut-lipi/src/detect.rs index 59c42b0..6ac124b 100644 --- a/vidyut-lipi/src/detect.rs +++ b/vidyut-lipi/src/detect.rs @@ -2,7 +2,7 @@ use crate::scheme::Scheme; -/// Detcts the scheme used by the given text. +/// Detects the scheme used by the given text. /// /// `detect` is ideal for interfaces where the user would otherwise need to manually choose which /// input encoding to use. `detect` removes some of this user friction by making a reasonable guess @@ -44,11 +44,12 @@ fn detect_inner(input: &str) -> Option { type Range = std::ops::RangeInclusive; - // These are Latin supplements for IAST, ISO-15919, etc. + // The Latin blocks below are used by IAST, ISO-15919, etc. 
// - // - https://unicode.org/charts/PDF/U0080.pdf - // - https://unicode.org/charts/PDF/U0100.pdf - // - https://unicode.org/charts/PDF/U1E00.pdf + // Docs: + // - + // - + // - const LATIN_1_SUPPLEMENT: Range = 0x0080..=0x00ff; const LATIN_EXTENDED_A: Range = 0x0100..=0x017f; const LATIN_EXTENDED: Range = 0x01e00..=0x01eff; @@ -72,6 +73,7 @@ fn detect_inner(input: &str) -> Option { const TIBETAN: Range = 0x0f00..=0x0fff; const MYANMAR: Range = 0x1000..=0x109f; const TAI_THAM: Range = 0x1a20..=0x1aaf; + const OL_CHIKI: Range = 0x1c50..=0x1c7f; const KHMER: Range = 0x1780..=0x17ff; const LIMBU: Range = 0x1900..=0x194f; const BALINESE: Range = 0x1b00..=0x1b7f; @@ -89,7 +91,9 @@ fn detect_inner(input: &str) -> Option { const TAKRI: Range = 0x11680..=0x116cf; const _AHOM: Range = 0x11700..=0x1174f; const DOGRA: Range = 0x11800..=0x1184f; + const NANDINAGARI: Range = 0x119a0..=0x119ff; const ZANABAZAR_SQUARE: Range = 0x11a00..=0x11a4f; + const SOYOMBO: Range = 0x11a50..=0x11aaf; const BHAIKSUKI: Range = 0x11c00..=0x11c6f; const MASARAM_GONDI: Range = 0x11d00..=0x11d5f; const GUNJALA_GONDI: Range = 0x11d60..=0x11daf; @@ -149,6 +153,8 @@ fn detect_inner(input: &str) -> Option { Some(Burmese) } else if TAI_THAM.contains(&code) { Some(TaiTham) + } else if OL_CHIKI.contains(&code) { + Some(OlChiki) } else if KHMER.contains(&code) { Some(Khmer) } else if LIMBU.contains(&code) { @@ -181,8 +187,12 @@ fn detect_inner(input: &str) -> Option { Some(Takri) } else if DOGRA.contains(&code) { Some(Dogra) + } else if NANDINAGARI.contains(&code) { + Some(Nandinagari) } else if ZANABAZAR_SQUARE.contains(&code) { Some(ZanabazarSquare) + } else if SOYOMBO.contains(&code) { + Some(Soyombo) } else if BHAIKSUKI.contains(&code) { Some(Bhaiksuki) } else if MASARAM_GONDI.contains(&code) { @@ -277,13 +287,16 @@ mod tests { ("അഗ്നിമ്", Malayalam), ("𑴫𑵀𑴫𑵅𑴌𑴶𑴛𑴤𑵄", MasaramGondi), ("𑘀𑘐𑘿𑘡𑘱𑘦𑘿", Modi), + ("𑧍𑧞𑧍𑧠𑦮𑧖𑦽𑧆𑧠", Nandinagari), ("𑚨𑚫𑚨𑚶𑚊𑚶𑚘𑚶𑚙𑚢𑚶", Takri), ("𑐀𑐐𑑂𑐣𑐶𑐩𑑂", Newa), ("ଅଗ୍ନିମ୍", Odia), + 
("ᱥᱚᱝᱥᱠᱨᱩᱛᱚᱢ", OlChiki), ("ꢂꢔ꣄ꢥꢶꢪ꣄", Saurashtra), ("𑆃𑆓𑇀𑆤𑆴𑆩𑇀", Sharada), ("𑖀𑖐𑖿𑖡𑖰𑖦𑖿", Siddham), ("අග්නිම්", Sinhala), + ("𑪁𑪖𑪁𑪙𑩜𑩙𑩫𑩴", Soyombo), ("அக்³நிம்", Tamil), ("అగ్నిమ్", Telugu), ("ཨགིམ", Tibetan), diff --git a/vidyut-lipi/src/errors.rs b/vidyut-lipi/src/errors.rs new file mode 100644 index 0000000..6b84317 --- /dev/null +++ b/vidyut-lipi/src/errors.rs @@ -0,0 +1,23 @@ +use std::fmt; + +/// A wrapper for `std::fmt::Result`. +#[allow(unused)] +pub type Result = std::result::Result; + +/// Models the error states of `vidyut-lipi`. +#[allow(unused)] +#[derive(Copy, Clone, Debug)] +pub enum LipiError { + /// Could not parse an input value. + ParseError, +} + +impl fmt::Display for LipiError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use LipiError::*; + + match self { + ParseError => write!(f, "parse error"), + } + } +} diff --git a/vidyut-lipi/src/lib.rs b/vidyut-lipi/src/lib.rs index 5d46976..af18b2f 100644 --- a/vidyut-lipi/src/lib.rs +++ b/vidyut-lipi/src/lib.rs @@ -4,6 +4,7 @@ mod autogen_schemes; mod detect; +mod errors; mod lipika; mod mapping; mod numerals; diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs index 64bc595..c9f3978 100644 --- a/vidyut-lipi/src/mapping.rs +++ b/vidyut-lipi/src/mapping.rs @@ -31,48 +31,51 @@ pub(crate) enum TokenKind { Consonant, /// A vowel mark, which generally must follow a consonant. VowelMark, + /// An ayogavaha (visarga, anusvara, candrabindu, etc.) + Ayogavaha, + /// An accent mark. + Accent, /// Any other token. 
Other, } impl TokenKind { fn from_devanagari_key(s: &str) -> Self { - const MARK_AA: u32 = 0x093e; - const MARK_AU: u32 = 0x094c; - const MARK_L: u32 = 0x0962; - const MARK_LL: u32 = 0x0963; - const MARK_PRISHTAMATRA_E: u32 = 0x094e; - const MARK_AW: u32 = 0x094f; - - const CONS_KA: u32 = 0x0915; - const CONS_HA: u32 = 0x0939; - const CONS_QA: u32 = 0x0958; - const CONS_YYA: u32 = 0x095f; - const CONS_DDDA: u32 = 0x097e; - const CONS_BBA: u32 = 0x097f; - const NUKTA: u32 = 0x093c; + use TokenKind::*; + + const MARK_AA: char = '\u{093e}'; + const MARK_AU: char = '\u{094c}'; + const MARK_L: char = '\u{0962}'; + const MARK_LL: char = '\u{0963}'; + const MARK_PRISHTAMATRA_E: char = '\u{094e}'; + const MARK_AW: char = '\u{094f}'; + + const CONS_KA: char = '\u{0915}'; + const CONS_HA: char = '\u{0939}'; + const CONS_QA: char = '\u{0958}'; + const CONS_YYA: char = '\u{095f}'; + const CONS_DDDA: char = '\u{097e}'; + const CONS_BBA: char = '\u{097f}'; + const NUKTA: char = '\u{093c}'; + + const CANDRABINDU: char = '\u{0901}'; + const VISARGA: char = '\u{0903}'; + + const SVARITA: char = '\u{0951}'; + const ANUDATTA: char = '\u{0952}'; if let Some(c) = s.chars().last() { - let code = c as u32; - if (MARK_AA..=MARK_AU).contains(&code) - || code == MARK_PRISHTAMATRA_E - || code == MARK_AW - || code == MARK_L - || code == MARK_LL - { - TokenKind::VowelMark - } else if (CONS_KA..=CONS_HA).contains(&code) - || (CONS_QA..=CONS_YYA).contains(&code) - || code == CONS_DDDA - || code == CONS_BBA - || code == NUKTA - { - TokenKind::Consonant - } else { - TokenKind::Other + match c { + (CONS_KA..=CONS_HA) | (CONS_QA..=CONS_YYA) | CONS_DDDA | CONS_BBA | NUKTA => { + Consonant + } + (MARK_AA..=MARK_AU) | MARK_PRISHTAMATRA_E | MARK_AW | MARK_L | MARK_LL => VowelMark, + CANDRABINDU..=VISARGA => Ayogavaha, + SVARITA | ANUDATTA => Accent, + _ => Other, } } else { - TokenKind::Other + Other } } } @@ -87,9 +90,9 @@ pub(crate) struct OneWayMapping { /// Maps from this scheme's digit chars to 
their numeric values. numeral_to_int: FxHashMap, /// The virama, or the empty string if not defined for this scheme. - virama: String, + pub(crate) virama: String, /// The letter representation of the "a" vowel. - letter_a: String, + pub(crate) letter_a: String, } impl OneWayMapping { @@ -249,10 +252,8 @@ pub struct Mapping { pub(crate) all: FxHashMap, pub(crate) marks: FxHashMap, - pub(crate) input_virama: String, - pub(crate) output_virama: String, - pub(crate) input_letter_a: String, - pub(crate) output_letter_a: String, + pub(crate) from_map: OneWayMapping, + pub(crate) to_map: OneWayMapping, pub(crate) len_longest_key: usize, pub(crate) numeral_to_int: FxHashMap, @@ -364,18 +365,17 @@ impl Mapping { int_to_numeral.insert(*v, k.to_string()); } let len_longest_key = all.keys().map(|a| a.len()).max().unwrap_or(0); + let numeral_to_int = a_map.numeral_to_int.clone(); Self { from, to, all, marks, - input_virama: a_map.virama, - output_virama: b_map.virama, - input_letter_a: a_map.letter_a, - output_letter_a: b_map.letter_a, + from_map: a_map, + to_map: b_map, len_longest_key, - numeral_to_int: a_map.numeral_to_int.clone(), + numeral_to_int, int_to_numeral, } } diff --git a/vidyut-lipi/src/reshape.rs b/vidyut-lipi/src/reshape.rs index 1993ec2..79922ef 100644 --- a/vidyut-lipi/src/reshape.rs +++ b/vidyut-lipi/src/reshape.rs @@ -23,11 +23,24 @@ //! WASM. So instead of using regexes, we've rolled our own logic for these transformations. use crate::scheme::Scheme; -use std::borrow::Cow; +use crate::unicode_norm; + +const BENGALI_LETTER_YA: char = '\u{09af}'; + +const BENGALI_LETTER_YYA: &str = "\u{09af}\u{09bc}"; + +const BENGALI_LETTER_TA: char = '\u{09a4}'; + +const BENGALI_LETTER_KHANDA_TA: char = '\u{09ce}'; + +const BENGALI_VIRAMA: char = '\u{09cd}'; /// Used instead of space (' ') in Bhaiksuki. const BHAIKSUKI_WORD_SEPARATOR: char = '\u{11c43}'; +/// Javanese virama. 
+const JAVANESE_PANGKON: char = '\u{a9c0}'; + /// "ra" consonant const KHMER_LETTER_RO: char = '\u{179a}'; @@ -41,6 +54,9 @@ const KHMER_SIGN_VIRIAM: char = '\u{17d1}'; /// Like virama, but indicates that next char should be subscripted. const KHMER_SIGN_COENG: char = '\u{17d2}'; +/// Limbu virama. +const LIMBU_SIGN_SA_I: char = '\u{193b}'; + // Generic ra const MASARAM_GONDI_LETTER_RA: char = '\u{11d26}'; @@ -56,6 +72,16 @@ const MASARAM_GONDI_RA_KARA: char = '\u{11d47}'; const MEETEI_MAYEK_APUN_IYEK: char = '\u{abed}'; +const MYANMAR_SIGN_VIRAMA: char = '\u{1039}'; + +const MYANMAR_SIGN_ASAT: char = '\u{103a}'; + +// Tai Tham virama. +const TAI_THAM_SIGN_RA_HAAM: char = '\u{1a7a}'; + +// Tai Tham combiner. +const TAI_THAM_SIGN_SAKOT: char = '\u{1a60}'; + /// Used instead of space (' ') in Tibetan const TIBETAN_MARK_INTERSYLLABLIC_TSHEG: char = '\u{0f0b}'; @@ -75,14 +101,139 @@ const ZANABAZAR_SQUARE_SIGN_VIRAMA: char = '\u{11a34}'; const ZANABAZAR_SQUARE_SUBJOINER: char = '\u{11a47}'; +fn is_svara(c: char) -> bool { + matches!(c, '\u{0951}' | '\u{0952}' | '\u{1cda}') +} + +fn is_ayogavaha(c: char) -> bool { + ('\u{0901}'..='\u{0903}').contains(&c) +} + +fn is_cham_yrlv(c: char) -> bool { + to_cham_subjoined_yrlv(c).is_some() +} + +fn to_cham_subjoined_yrlv(c: char) -> Option { + if ('\u{aa22}'..='\u{aa25}').contains(&c) { + let code = c as u32; + char::from_u32(code + 0x0011) + } else { + None + } +} + +fn has_cham_final_consonant(c: char) -> bool { + to_cham_final_consonant(c).is_some() +} + +fn to_cham_final_consonant(c: char) -> Option { + let ret = match c { + // Skip SIGN_FINAL_NG -- not sure how to use it. 
+ '\u{aa06}' => '\u{aa40}', // KA + '\u{aa07}' => '\u{aa40}', // KHA --> KA + '\u{aa08}' => '\u{aa41}', // GA + '\u{aa09}' => '\u{aa41}', // GHA --> GA + '\u{aa0b}' => '\u{aa42}', // NGA + '\u{aa0c}' => '\u{aa44}', // CHA + '\u{aa0d}' => '\u{aa44}', // CHHA --> CHA + '\u{aa0e}' => '\u{aa44}', // JA --> CHA + '\u{aa0f}' => '\u{aa44}', // JHA --> CHA + '\u{aa13}' => '\u{aa45}', // TA + '\u{aa14}' => '\u{aa45}', // THA --> TA + '\u{aa15}' => '\u{aa45}', // DA --> TA + '\u{aa16}' => '\u{aa45}', // DHA --> TA + '\u{aa18}' => '\u{aa46}', // NA + '\u{aa1a}' => '\u{aa47}', // PA + '\u{aa1c}' => '\u{aa47}', // PHA --> PA + '\u{aa1d}' => '\u{aa47}', // BA --> PA + '\u{aa1e}' => '\u{aa47}', // BHA --> PA + '\u{aa20}' => '\u{aa4c}', // MA + '\u{aa22}' => '\u{aa48}', // YA + '\u{aa23}' => '\u{aa49}', // RA + '\u{aa24}' => '\u{aa4a}', // LA + '\u{aa26}' => '\u{aa4b}', // SSA + '\u{aa27}' => '\u{aa4b}', // SA --> SSA + '\u{aa28}' => '\u{aa4d}', // HA + _ => return None, + }; + Some(ret) +} + fn is_gunjala_gondi_consonant(c: char) -> bool { ('\u{11d6c}'..='\u{11d89}').contains(&c) } +fn from_javanese_medial(c: char) -> Option { + let ret = match c { + '\u{a9be}' => '\u{a9aa}', + '\u{a9bf}' => '\u{a9ab}', + _ => return None, + }; + Some(ret) +} + +fn to_javanese_medial(c: char) -> Option { + let ret = match c { + '\u{a9aa}' => '\u{a9be}', + '\u{a9ab}' => '\u{a9bf}', + _ => return None, + }; + Some(ret) +} + fn is_khmer_consonant(c: char) -> bool { ('\u{1780}'..='\u{17a2}').contains(&c) } +fn is_limbu_standard_yrv(c: char) -> bool { + to_limbu_subjoined_yrv(c).is_some() +} + +fn is_limbu_subjoined_yrv(c: char) -> bool { + to_limbu_standard_yrv(c).is_some() +} + +fn to_limbu_standard_yrv(c: char) -> Option { + let ret = match c { + '\u{1929}' => '\u{1915}', + '\u{192a}' => '\u{1916}', + '\u{192b}' => '\u{1918}', + _ => return None, + }; + Some(ret) +} + +fn to_limbu_subjoined_yrv(c: char) -> Option { + let ret = match c { + '\u{1915}' => '\u{1929}', + '\u{1916}' => '\u{192a}', + 
'\u{1918}' => '\u{192b}', + _ => return None, + }; + Some(ret) +} + +#[allow(unused)] +fn has_limbu_final_consonant(c: char) -> bool { + to_limbu_final_consonant(c).is_some() +} + +#[allow(unused)] +fn to_limbu_final_consonant(c: char) -> Option { + let ret = match c { + '\u{1901}' => '\u{1930}', + '\u{1905}' => '\u{1931}', + '\u{190b}' => '\u{1933}', + '\u{190f}' => '\u{1934}', + '\u{1910}' => '\u{1935}', + '\u{1914}' => '\u{1936}', + '\u{1916}' => '\u{1937}', + '\u{1917}' => '\u{1938}', + _ => return None, + }; + Some(ret) +} + fn is_masaram_gondi_consonant(c: char) -> bool { ('\u{11d0c}'..='\u{11d30}').contains(&c) || c == MASARAM_GONDI_REPHA @@ -104,10 +255,45 @@ fn to_meetei_mayek_final_consonant(c: char) -> Option { Some(ret) } +fn is_myanmar_consonant(c: char) -> bool { + ('\u{1000}'..='\u{1020}').contains(&c) || matches!(c, '\u{1050}' | '\u{1051}') +} + +fn to_myanmar_subjoined_yrvh(c: char) -> Option { + let ret = match c { + '\u{101a}' => '\u{103b}', + '\u{101b}' => '\u{103c}', + '\u{101d}' => '\u{103d}', + '\u{101f}' => '\u{103e}', + _ => return None, + }; + Some(ret) +} + +fn to_myanmar_standard_yrvh(c: char) -> Option { + let ret = match c { + '\u{103b}' => '\u{101a}', + '\u{103c}' => '\u{101b}', + '\u{103d}' => '\u{101d}', + '\u{103e}' => '\u{101f}', + _ => return None, + }; + Some(ret) +} + fn is_tamil_superscript(c: char) -> bool { ['²', '³', '⁴'].contains(&c) } +fn is_tai_tham_consonant(c: char) -> bool { + const TAI_THAM_LETTER_RUE: char = '\u{1a42}'; + const TAI_THAM_LETTER_LUE: char = '\u{1a44}'; + + // Ignore RUE and LUE, which are used for vocalic R/L. + ('\u{1a20}'..='\u{1a4c}').contains(&c) + && !matches!(c, TAI_THAM_LETTER_RUE | TAI_THAM_LETTER_LUE) +} + /// Returns whether `c` denotes a Tamil marker that must precede the superscript sign. 
fn is_tamil_preceding_mark(c: char) -> bool { const TA_VOWEL_AA: char = '\u{0bbe}'; @@ -243,7 +429,7 @@ impl<'a> Matcher<'a> { let mut chars = self.slice().chars(); if let Some(x) = chars.next() { func(&mut self.buf, x); - self.i += self.char_offset_to_byte_offset(1); + self.i += x.len_utf8(); } } @@ -252,7 +438,7 @@ impl<'a> Matcher<'a> { let mut chars = self.slice().chars(); if let (Some(x), Some(y)) = (chars.next(), chars.next()) { func(&mut self.buf, x, y); - self.i += self.char_offset_to_byte_offset(2); + self.i += x.len_utf8() + y.len_utf8(); } } @@ -261,7 +447,7 @@ impl<'a> Matcher<'a> { let mut chars = self.slice().chars(); if let (Some(x), Some(y), Some(z)) = (chars.next(), chars.next(), chars.next()) { func(&mut self.buf, x, y, z); - self.i += self.char_offset_to_byte_offset(3); + self.i += x.len_utf8() + y.len_utf8() + z.len_utf8(); } } @@ -269,24 +455,28 @@ impl<'a> Matcher<'a> { fn push_next(&mut self) { self.take_1(|buf, x| buf.push(x)); } - - /// Converts a char offset to a byte offset. - fn char_offset_to_byte_offset(&self, i_char: usize) -> usize { - self.slice() - .chars() - .take(i_char) - .map(|c| c.len_utf8()) - .sum() - } } /// Reshapes `input` before we run the main transliteration function. /// -/// A `Cow` type lets us return either a `&str` or a `String`. We return a `Cow` because most -/// schemes do not need to rewrite the input string at all and can return `&str` directly. -pub fn reshape_before(input: &str, from: Scheme) -> Cow { - let mut m = Matcher::new(input); +/// Once this function matures, we will consider switching to an iterator-based implementation. +pub fn reshape_before(input: &str, from: Scheme, to: Scheme) -> String { + // Convert to NFC first to avoid certain transliteration errors. + // (See `iso_15919_bug_no_greedy_match_on_nfd` for an example of what we want to prevent.) 
+ let input = unicode_norm::to_nfc(input); + + let mut m = Matcher::new(&input); match from { + Scheme::Assamese | Scheme::Bengali => { + while m.not_empty() { + if m.match_1(|x| x == BENGALI_LETTER_KHANDA_TA) { + m.take_1(|buf, _| buf.extend(&[BENGALI_LETTER_TA, BENGALI_VIRAMA])) + } else { + m.push_next(); + } + } + m.finish() + } Scheme::Bhaiksuki => { while m.not_empty() { if m.match_1(|x| x == BHAIKSUKI_WORD_SEPARATOR) { @@ -296,7 +486,51 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() + } + Scheme::Burmese | Scheme::Mon => { + while m.not_empty() { + if m.match_1(|x| to_myanmar_standard_yrvh(x).is_some()) { + // word separator --> space + m.take_1(|buf, x| { + let x_new = to_myanmar_standard_yrvh(x).expect("yrvh"); + buf.extend(&[MYANMAR_SIGN_ASAT, x_new]); + }); + } else if m.match_1(|x| x == MYANMAR_SIGN_VIRAMA) { + m.take_1(|buf, _| buf.push(MYANMAR_SIGN_ASAT)); + } else { + m.push_next(); + } + } + m.finish() + } + Scheme::Devanagari => { + if to.is_alphabet() { + while m.not_empty() { + if m.match_2(|x, y| is_ayogavaha(x) && is_svara(y)) { + m.take_2(|buf, x, y| buf.extend(&[y, x])); + } else { + m.push_next(); + } + } + m.finish() + } else { + input + } + } + Scheme::Javanese => { + while m.not_empty() { + if m.match_1(|x| from_javanese_medial(x).is_some()) { + // word separator --> space + m.take_1(|buf, x| { + let x_new = from_javanese_medial(x).expect("medial"); + buf.extend(&[JAVANESE_PANGKON, x_new]); + }); + } else { + m.push_next(); + } + } + m.finish() } Scheme::Khmer => { // TODO: rewrite anusvara per Aksharamukha. 
@@ -311,8 +545,49 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() + } + Scheme::Limbu => { + while m.not_empty() { + if m.match_1(is_limbu_subjoined_yrv) { + m.take_1(|buf, x| { + buf.extend(&[LIMBU_SIGN_SA_I, to_limbu_standard_yrv(x).expect("yrv")]) + }); + } else { + m.push_next(); + } + } + m.finish() + } + Scheme::Malayalam => { + const MALAYALAM_SIGN_VIRAMA: char = '\u{0d4d}'; + + fn from_malayalam_chillu(c: char) -> Option { + let ret = match c { + '\u{0d7a}' => '\u{0d23}', // nna + '\u{0d7b}' => '\u{0d28}', // na + '\u{0d7c}' => '\u{0d30}', // rra + '\u{0d7d}' => '\u{0d32}', // la + '\u{0d7e}' => '\u{0d33}', // lla + '\u{0d7f}' => '\u{0d15}', // ka + _ => return None, + }; + Some(ret) + } + + while m.not_empty() { + if m.match_1(|x| from_malayalam_chillu(x).is_some()) { + m.take_1(|buf, x| { + let x_new = from_malayalam_chillu(x).expect("chillu"); + buf.extend(&[x_new, MALAYALAM_SIGN_VIRAMA]) + }) + } else { + m.push_next(); + } + } + m.finish() } + Scheme::MasaramGondi => { while m.not_empty() { if m.match_2(|x, y| is_masaram_gondi_consonant(x) && y == MASARAM_GONDI_RA_KARA) { @@ -334,7 +609,18 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() + } + Scheme::TaiTham => { + while m.not_empty() { + if m.match_1(|x| x == TAI_THAM_SIGN_SAKOT) { + // combiner --> virama + m.take_1(|buf, _| buf.push(TAI_THAM_SIGN_RA_HAAM)); + } else { + m.push_next(); + } + } + m.finish() } Scheme::Tamil => { while m.not_empty() { @@ -345,7 +631,7 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() } Scheme::Thai => { while m.not_empty() { @@ -359,7 +645,7 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() } Scheme::Tibetan => { const TIBETAN_CATURTHA_HALF: &[char] = @@ -397,7 +683,7 @@ pub fn reshape_before(input: 
&str, from: Scheme) -> Cow { } } - Cow::Owned(m.finish()) + m.finish() } Scheme::ZanabazarSquare => { const ZANABAZAR_SQUARE_MARK_TSHEG: char = '\u{11a41}'; @@ -412,9 +698,23 @@ pub fn reshape_before(input: &str, from: Scheme) -> Cow { m.push_next(); } } - Cow::Owned(m.finish()) + m.finish() } - _ => Cow::Borrowed(input), + _ => input, + } +} + +fn is_bengali_sound(c: char) -> bool { + match c { + // Signs, vowels, consonants + '\u{0981}'..='\u{09bc}' => true, + // Dependent vowels + '\u{09be}'..='\u{09cc}' => true, + // Other consonants and signs + '\u{09ce}'..='\u{09e3}' => true, + // Assamese + '\u{09f0}'..='\u{09f1}' => true, + _ => false, } } @@ -423,6 +723,30 @@ pub fn reshape_after(output: String, to: Scheme) -> String { let mut m = Matcher::new(&output); match to { + Scheme::Assamese | Scheme::Bengali => { + while m.not_empty() { + if m.match_2(|x, y| is_bengali_sound(x) && y == BENGALI_LETTER_YA) { + m.take_2(|buf, x, _| { + buf.push(x); + buf.push_str(BENGALI_LETTER_YYA); + }) + } else if m.match_2(|x, y| x == BENGALI_LETTER_TA && y == BENGALI_VIRAMA) { + let mut chars = m.slice().chars(); + let z = chars.nth(2); + + // virama + ta + (end) --> khanda ta + (end) + // virama + ta + (non-sound) --> khanda ta + (non-sound) + if z.map_or(true, |c| !is_bengali_sound(c)) { + m.take_2(|buf, _, _| buf.push(BENGALI_LETTER_KHANDA_TA)); + } else { + m.push_next(); + } + } else { + m.push_next(); + } + } + m.finish() + } Scheme::Bhaiksuki => { while m.not_empty() { if m.match_1(|x| x == ' ') { @@ -434,6 +758,73 @@ pub fn reshape_after(output: String, to: Scheme) -> String { } m.finish() } + Scheme::Burmese | Scheme::Mon => { + while m.not_empty() { + if m.match_2(|x, y| x == MYANMAR_SIGN_ASAT && is_myanmar_consonant(y)) { + m.take_2(|buf, _, y| { + if let Some(y_new) = to_myanmar_subjoined_yrvh(y) { + buf.extend(&[y_new]); + } else { + buf.extend(&[MYANMAR_SIGN_VIRAMA, y]); + } + }); + } else { + m.push_next(); + } + } + m.finish() + } + Scheme::Cham => { + const 
FAKE_VIRAMA: char = '\u{02be}'; + while m.not_empty() { + if m.match_2(|x, y| x == FAKE_VIRAMA && is_cham_yrlv(y)) { + m.take_2(|buf, _, y| { + let y_new = to_cham_subjoined_yrlv(y).expect("yrlv"); + buf.push(y_new) + }) + } else { + m.push_next(); + } + } + + // Substitution above blocks substitution here, so split into two Matchers. + let first = m.finish(); + let mut m = Matcher::new(&first); + while m.not_empty() { + if m.match_2(|x, y| has_cham_final_consonant(x) && y == FAKE_VIRAMA) { + m.take_2(|buf, x, _| { + let x_new = to_cham_final_consonant(x).expect("has final"); + buf.push(x_new) + }); + } else { + m.push_next(); + } + } + m.finish() + } + Scheme::Devanagari => { + while m.not_empty() { + if m.match_2(|x, y| is_svara(x) && is_ayogavaha(y)) { + m.take_2(|buf, x, y| buf.extend(&[y, x])); + } else { + m.push_next(); + } + } + m.finish() + } + Scheme::Javanese => { + while m.not_empty() { + if m.match_2(|x, y| x == JAVANESE_PANGKON && to_javanese_medial(y).is_some()) { + m.take_2(|buf, _, y| { + let y_new = to_javanese_medial(y).expect("medial"); + buf.extend(&[y_new]); + }); + } else { + m.push_next(); + } + } + m.finish() + } Scheme::GunjalaGondi => { const GUNJALA_GONDI_VIRAMA: char = '\u{11d97}'; while m.not_empty() { @@ -472,6 +863,16 @@ pub fn reshape_after(output: String, to: Scheme) -> String { } m.finish() } + Scheme::Limbu => { + while m.not_empty() { + if m.match_2(|x, y| x == LIMBU_SIGN_SA_I && is_limbu_standard_yrv(y)) { + m.take_2(|buf, _, y| buf.push(to_limbu_subjoined_yrv(y).expect("y r v"))); + } else { + m.push_next(); + } + } + m.finish() + } Scheme::MasaramGondi => { while m.not_empty() { if m.match_3(|x, y, z| { @@ -513,6 +914,31 @@ pub fn reshape_after(output: String, to: Scheme) -> String { } m.finish() } + Scheme::TaiTham => { + const RA: char = '\u{1a41}'; + const LA: char = '\u{1a43}'; + const MEDIAL_RA: char = '\u{1a55}'; + const MEDIAL_LA: char = '\u{1a56}'; + + while m.not_empty() { + if m.match_2(|x, y| x == 
TAI_THAM_SIGN_RA_HAAM && matches!(y, RA | LA)) { + m.take_2(|buf, _, y| { + if y == RA { + buf.push(MEDIAL_RA); + } else { + buf.push(MEDIAL_LA); + } + }); + } else if m.match_2(|x, y| x == TAI_THAM_SIGN_RA_HAAM && is_tai_tham_consonant(y)) { + m.take_2(|buf, _, y| { + buf.extend(&[TAI_THAM_SIGN_SAKOT, y]); + }); + } else { + m.push_next(); + } + } + m.finish() + } Scheme::Tamil => { while m.not_empty() { if m.match_2(|x, y| is_tamil_superscript(x) && is_tamil_preceding_mark(y)) { diff --git a/vidyut-lipi/src/scheme.rs b/vidyut-lipi/src/scheme.rs index 15b74bd..197f1cc 100644 --- a/vidyut-lipi/src/scheme.rs +++ b/vidyut-lipi/src/scheme.rs @@ -32,6 +32,14 @@ pub(crate) enum Coverage { #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] #[wasm_bindgen] pub enum Scheme { + /// Assamese script. + /// + /// The Assamese script uses the same characters as the Bengali script, with a few minor + /// differences. + /// + /// Docs: + Assamese, + /// Balinese script. /// /// Docs: @@ -113,6 +121,11 @@ pub enum Scheme { /// Docs: Kannada, + /// Kharoshthi script. + /// + /// Docs: + Kharoshthi, + /// Khmer script. /// /// Docs: @@ -133,6 +146,11 @@ pub enum Scheme { /// Docs: Limbu, + /// Mahajani script. + /// + /// Docs: + /// Mahajani, + /// Malayalam script. /// /// Docs: @@ -150,9 +168,14 @@ pub enum Scheme { /// Modi script. /// - /// + /// Docs: Modi, + /// Mon script. + /// + /// Docs: + Mon, + /// Lao script. /// /// Docs: @@ -205,6 +228,11 @@ pub enum Scheme { /// Docs: Sinhala, + /// Soyombo script. + /// + /// Docs: + Soyombo, + /// Tai Tham script (Lanna) /// /// Docs: @@ -272,9 +300,9 @@ pub enum Scheme { /// ITRANS 5.3 transliteration. 
/// /// Docs: - /// - https://www.aczoom.com/itrans/ (official ITRANS site for version 5.3) - /// - https://www.aczoom.com/itrans/html/dvng/node3.html (DEVNAG table) - /// - http://www.sanskritweb.net/itrans/itmanual2003.pdf (Itranslator 2003 manual) + /// - (official ITRANS site for version 5.3) + /// - (DEVNAG table) + /// - (Itranslator 2003 manual) /// /// ITRANS appears in various versions, some of which conflict with each other. Version 5.3 /// seems to be the most widely used, and it is supported by software like Itranslator 2003. @@ -304,7 +332,7 @@ impl Scheme { pub fn iter() -> impl Iterator { use Scheme::*; const SCHEMES: &[Scheme] = &[ - // Assamese, + Assamese, Balinese, BarahaSouth, Bengali, @@ -325,13 +353,15 @@ impl Scheme { Javanese, Kaithi, Kannada, + Kharoshthi, Khmer, Khudawadi, Limbu, Malayalam, - MeeteiMayek, MasaramGondi, + MeeteiMayek, Modi, + Mon, Nandinagari, Newa, Odia, @@ -341,7 +371,7 @@ impl Scheme { Siddham, Sinhala, Slp1, - // Soyombo, + Soyombo, TaiTham, Takri, Tamil, @@ -350,7 +380,6 @@ impl Scheme { Tibetan, Tirhuta, Velthuis, - // Wancho, Wx, ZanabazarSquare, ]; @@ -362,6 +391,7 @@ impl Scheme { match self { // Abugidas + Scheme::Assamese => auto::ASSAMESE, Scheme::Balinese => auto::BALINESE, Scheme::Bengali => auto::BENGALI, Scheme::Bhaiksuki => auto::BHAIKSUKI, @@ -377,6 +407,7 @@ impl Scheme { Scheme::Javanese => auto::JAVANESE, Scheme::Kaithi => auto::KAITHI, Scheme::Kannada => auto::KANNADA, + Scheme::Kharoshthi => auto::KHAROSHTHI, Scheme::Khmer => auto::KHMER, Scheme::Khudawadi => auto::KHUDAWADI, // Scheme::Lao => auto::LAO, @@ -386,6 +417,7 @@ impl Scheme { Scheme::MeeteiMayek => auto::MEETEI_MAYEK, Scheme::MasaramGondi => auto::MASARAM_GONDI, Scheme::Modi => auto::MODI, + Scheme::Mon => auto::MON, Scheme::Nandinagari => auto::NANDINAGARI, Scheme::Newa => auto::NEWA, Scheme::Odia => auto::ORIYA, @@ -394,6 +426,7 @@ impl Scheme { Scheme::Sharada => auto::SHARADA, Scheme::Siddham => auto::SIDDHAM, Scheme::Sinhala => 
auto::SINHALA, + Scheme::Soyombo => auto::SOYOMBO, Scheme::TaiTham => auto::TAI_THAM, Scheme::Takri => auto::TAKRI, Scheme::Tamil => auto::TAMIL, @@ -407,7 +440,7 @@ impl Scheme { Scheme::BarahaSouth => auto::BARAHA, Scheme::HarvardKyoto => auto::HK, Scheme::Iast => auto::IAST, - Scheme::Iso15919 => auto::ISO, + Scheme::Iso15919 => auto::ISO_15919, Scheme::Itrans => auto::ITRANS, Scheme::Slp1 => auto::SLP1, Scheme::Velthuis => auto::VELTHUIS, @@ -470,11 +503,12 @@ impl Scheme { // Use an exhaustive match (no `_`) so that we explicitly account for all schemes. match self { // Abugidas are all `true`. - Balinese | Bengali | Bhaiksuki | Brahmi | Burmese | Cham | Devanagari | Dogra - | Grantha | Gujarati | GunjalaGondi | Gurmukhi | Javanese | Kaithi | Kannada - | Khmer | Khudawadi | Limbu | Malayalam | MasaramGondi | MeeteiMayek | Modi - | Nandinagari | Newa | Odia | Saurashtra | Sharada | Siddham | Sinhala | TaiTham - | Takri | Tamil | Telugu | Thai | Tibetan | Tirhuta | ZanabazarSquare => true, + Assamese | Balinese | Bengali | Bhaiksuki | Brahmi | Burmese | Cham | Devanagari + | Dogra | Grantha | Gujarati | GunjalaGondi | Gurmukhi | Javanese | Kaithi + | Kannada | Khmer | Kharoshthi | Khudawadi | Limbu | Malayalam | MasaramGondi + | MeeteiMayek | Modi | Mon | Nandinagari | Newa | Odia | Saurashtra | Sharada + | Siddham | Sinhala | Soyombo | TaiTham | Takri | Tamil | Telugu | Thai | Tibetan + | Tirhuta | ZanabazarSquare => true, // Alphabets are all `false`. BarahaSouth | HarvardKyoto | Iso15919 | Itrans | Iast | OlChiki | Slp1 | Velthuis @@ -544,13 +578,13 @@ mod tests { // // Don't use `_`, as that would defeat the point of this test. 
match s { - Balinese | BarahaSouth | Bengali | Bhaiksuki | Brahmi | Burmese | Cham - | Devanagari | Dogra | GunjalaGondi | Grantha | Gujarati | Gurmukhi - | HarvardKyoto | Iast | Iso15919 | Itrans | Javanese | Kaithi | Kannada | Khmer - | Khudawadi | Limbu | Malayalam | MasaramGondi | MeeteiMayek | Nandinagari - | Modi | Newa | Odia | OlChiki | Saurashtra | Sharada | Siddham | Sinhala - | Slp1 | TaiTham | Takri | Tamil | Telugu | Thai | Tibetan | Tirhuta | Velthuis - | Wx | ZanabazarSquare => { + Assamese | Balinese | BarahaSouth | Bengali | Bhaiksuki | Brahmi | Burmese + | Cham | Devanagari | Dogra | GunjalaGondi | Grantha | Gujarati | Gurmukhi + | HarvardKyoto | Iast | Iso15919 | Itrans | Javanese | Kaithi | Kannada + | Kharoshthi | Khmer | Khudawadi | Limbu | Malayalam | MasaramGondi + | MeeteiMayek | Nandinagari | Modi | Mon | Newa | Odia | OlChiki | Saurashtra + | Sharada | Siddham | Sinhala | Slp1 | Soyombo | TaiTham | Takri | Tamil + | Telugu | Thai | Tibetan | Tirhuta | Velthuis | Wx | ZanabazarSquare => { expected.push(*s); } } diff --git a/vidyut-lipi/src/transliterate.rs b/vidyut-lipi/src/transliterate.rs index 21de470..327dc08 100644 --- a/vidyut-lipi/src/transliterate.rs +++ b/vidyut-lipi/src/transliterate.rs @@ -44,7 +44,7 @@ pub fn transliterate(input: impl AsRef, mapping: &Mapping) -> String { /// pass. For more complex scheme pairs (such as `Tibetan` to `Khmer`), this code will make three /// passes total. fn transliterate_inner(input: &str, mapping: &Mapping) -> String { - let input = reshape_before(input, mapping.from()); + let input = reshape_before(input, mapping.from(), mapping.to()); let is_to_alphabet = mapping.to.is_alphabet(); let is_from_abugida = mapping.from.is_abugida(); @@ -97,9 +97,9 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { // Abugidas and alphabets have distinct logic here, so keep their code neatly separate. 
if is_from_abugida { - let a = &mapping.output_letter_a; + let a = &mapping.to_map.letter_a; if output.ends_with(a) - && (mapping.marks.contains_key(key) || key == mapping.input_virama) + && (mapping.marks.contains_key(key) || key == mapping.from_map.virama) { // `key` maps to a token that blocks the default "a" vowel, so pop the "a" that // we added in the previous iteration. @@ -116,7 +116,7 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { } } else { // Transliterate from alphabet - if had_virama && key == mapping.input_letter_a { + if had_virama && key == mapping.from_map.letter_a { // `key` is the default "a" vowel, so pop the virama that we added in the // previous iteration. output.pop(); @@ -136,7 +136,7 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String { // We have not seen a vowel mark yet, so push a virama for now. // // (The next loop iteration might pop this virama off of `output`.) - output += &mapping.output_virama; + output += &mapping.to_map.virama; had_virama = true; } } diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs index 59e4ece..cfcfe16 100644 --- a/vidyut-lipi/src/unicode_norm.rs +++ b/vidyut-lipi/src/unicode_norm.rs @@ -19,9 +19,9 @@ //! //! Rough size estimates, as of 2024-01-22: //! -//! - Code without NFC/NFD logic: ~ 129 KiB wasm -//! - Code with this module: ~ 134 KiB wasm -//! - Code with `unicode_normaliation`: ~ 248 KiB wasm +//! - Code without NFC/NFD logic : ~ 129 KiB wasm +//! - Code with this module: ~ 134 KiB wasm +//! - Code with `unicode_normalization`: ~ 248 KiB wasm //! //! 
[1]: https://docs.rs/unicode-normalization/latest/unicode_normalization/ @@ -61,6 +61,7 @@ pub const LATIN_NFD: Table = &[ ("\u{1e0d}", "d\u{0323}"), // ḍ ("\u{1e24}", "H\u{0323}"), // Ḥ ("\u{1e25}", "h\u{0323}"), // ḥ + ("\u{1e2b}", "h\u{032e}"), // ḫ ("\u{1e32}", "K\u{0323}"), // Ḳ ("\u{1e33}", "k\u{0323}"), // ḳ ("\u{1e36}", "L\u{0323}"), // Ḷ @@ -248,7 +249,13 @@ pub const TIRHUTA_NFD: Table = &[ ("\u{114be}", "\u{114b9}\u{114bd}"), // vowel sign au ]; -#[allow(unused)] +/// Converts `s` to its NFC representation. +/// +/// Only characters that appear in one of our `Scheme`s will be converted. All other characters +/// will be left as-is. +/// +/// TODO: consider using `unicode_normalization` in non-WASM with conditional compilation. Leaning +/// against due to having to reason about two different systems. pub(crate) fn to_nfc(s: &str) -> String { let mut map = FxHashMap::default(); let mut len_longest_key = 0; diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs index c39e537..ed9e4f5 100644 --- a/vidyut-lipi/tests/basic.rs +++ b/vidyut-lipi/tests/basic.rs @@ -62,6 +62,9 @@ fn assert_one_way_pairwise(reference: (Scheme, &str), examples: &[(Scheme, &str) assert_two_way_pairwise(examples); } +/// Asserts that: +/// 1. each test case has multiple unicode variants +/// 2. for each test case, all unicode variants have the same transliteration output. fn assert_supports_unicode_variants(scheme: Scheme, test_cases: &[&str]) { use unicode_normalization::UnicodeNormalization; @@ -76,7 +79,7 @@ fn assert_supports_unicode_variants(scheme: Scheme, test_cases: &[&str]) { // The choice of output_scheme does not really matter here. What we're testing is that the // transliterator treats our two inputs in the same way. 
- let output_scheme = Scheme::Itrans; + let output_scheme = Scheme::Devanagari; let output_nfc = lipika.transliterate(&input_nfc, scheme, output_scheme); let output_nfd = lipika.transliterate(&input_nfd, scheme, output_scheme); assert_eq!( @@ -102,11 +105,12 @@ fn sanskrit_independent_vowels() { (Velthuis, "a aa i ii u uu .r .R .l .L e ai o au"), (Wx, "a A i I u U q Q L LV e E o O"), // Indic + (Assamese, "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"), (Balinese, "ᬅ ᬆ ᬇ ᬈ ᬉ ᬊ ᬋ ᬌ ᬍ ᬎ ᬏ ᬐ ᬑ ᬒ"), (Bengali, "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"), (Brahmi, "𑀅 𑀆 𑀇 𑀈 𑀉 𑀊 𑀋 𑀌 𑀍 𑀎 𑀏 𑀐 𑀑 𑀒"), (Burmese, "အ အာ ဣ ဤ ဥ ဦ ၒ ၓ ၔ ၕ ဧ အဲ ဩ ဪ"), - // (Cham, "ꨀ ꨀꨩ ꨁ ꨁꨩ ꨂ ꨂꨩ ꨣꨮ ꨣꨮꨩ ꨤꨮ ꨤꨮꨩ ꨃ ꨄ ꨅ ꨀꨯꨱ"), + (Cham, "ꨀ ꨀꨩ ꨁ ꨁꨩ ꨂ ꨂꨩ ꨣꨮ ꨣꨮꨩ ꨤꨮ ꨤꨮꨩ ꨃ ꨄ ꨅ ꨀꨯꨱ"), (Devanagari, "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ए ऐ ओ औ"), (Dogra, "𑠀 𑠁 𑠂 𑠃 𑠄 𑠅 𑠤𑠭 𑠤𑠮 𑠥𑠭 𑠥𑠮 𑠆 𑠇 𑠈 𑠉"), (Grantha, "𑌅 𑌆 𑌇 𑌈 𑌉 𑌊 𑌋 𑍠 𑌌 𑍡 𑌏 𑌐 𑌓 𑌔"), @@ -116,11 +120,13 @@ fn sanskrit_independent_vowels() { (Khmer, "អ អា ឥ ឦ ឧ ឩ ឫ ឬ ឭ ឮ ឯ ឰ ឱ ឳ"), (Malayalam, "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ ഏ ഐ ഓ ഔ"), (Modi, "𑘀 𑘁 𑘂 𑘃 𑘄 𑘅 𑘆 𑘇 𑘈 𑘉 𑘊 𑘋 𑘌 𑘍"), + (Mon, "အ အာ ဣ ဣဳ ဥ ဥူ ၒ ၓ ၔ ၕ ဨ အဲ ဩ ဪ"), (Newa, "𑐀 𑐁 𑐂 𑐃 𑐄 𑐅 𑐆 𑐇 𑐈 𑐉 𑐊 𑐋 𑐌 𑐍"), (Odia, "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ"), (Saurashtra, "ꢂ ꢃ ꢄ ꢅ ꢆ ꢇ ꢈ ꢉ ꢊ ꢋ ꢍ ꢎ ꢐ ꢑ"), (Sharada, "𑆃 𑆄 𑆅 𑆆 𑆇 𑆈 𑆉 𑆊 𑆋 𑆌 𑆍 𑆎 𑆏 𑆐"), (Sinhala, "අ ආ ඉ ඊ උ ඌ ඍ ඎ ඏ ඐ ඒ ඓ ඕ ඖ"), + (Soyombo, "𑩐 𑩐𑩛 𑩐𑩑 𑩐𑩑𑩛 𑩐𑩒 𑩐𑩒𑩛 𑩐𑩙 𑩐𑩙𑩛 𑩐𑩚 𑩐𑩚𑩛 𑩐𑩔 𑩐𑩗 𑩐𑩖 𑩐𑩘"), (TaiTham, "ᩋ ᩋᩣ ᩍ ᩎ ᩏ ᩐ ᩂ ᩂᩣ ᩄ ᩄᩣ ᩑ ᩋᩱ ᩒ ᩏᩫᩣ"), (Tamil, "அ ஆ இ ஈ உ ஊ ருʼ ரூʼ லுʼ லூʼ ஏ ஐ ஓ ஔ"), (Telugu, "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఏ ఐ ఓ ఔ"), @@ -139,6 +145,7 @@ fn sanskrit_independent_vowels() { // Scripts with no vocalic L/LL assert_two_way_pairwise(&[ (Slp1, slp1_text), + (Kharoshthi, "𐨀 𐨀𐨌 𐨀𐨁 𐨀𐨁𐨌 𐨀𐨂 𐨀𐨂𐨌 𐨀𐨃 𐨀𐨃𐨌 𐨫𐨂 𐨫𐨂𐨌 𐨀𐨅 𐨀𐨅𐨌 𐨀𐨆 𐨀𐨆𐨌"), (Nandinagari, "𑦠 𑦡 𑦢 𑦣 𑦤 𑦥 𑦦 𑦧 𑧉𑧔 𑧉𑧕 𑦪 𑦫 𑦬 𑦭"), (Siddham, "𑖀 𑖁 𑖂 𑖃 𑖄 𑖅 𑖆 𑖇 𑖈 𑖉 𑖊 𑖋 𑖌 𑖍"), ]); @@ -150,6 +157,7 @@ fn sanskrit_independent_vowels() { (Gurmukhi, "ਅ ਆ ਇ ਈ ਉ ਊ ਰੁ ਰੂ ਲੁ ਲੂ ਏ ਐ ਓ ਔ"), (Kaithi, "𑂃 𑂄 𑂅 𑂆 𑂇 𑂈 𑂩𑂱 𑂩𑂲 𑂪𑂱 𑂪𑂲 𑂉 𑂊 𑂋 𑂌"), (Khudawadi, "𑊰 𑊱 𑊲 𑊳 𑊴 𑊵 𑋙𑋡 𑋙𑋢 𑋚𑋡 𑋚𑋢 𑊶 𑊷 𑊸 𑊹"), + (Limbu, "ᤀ ᤀᤠ ᤀᤡ ᤀᤡ᤺ ᤀᤢ ᤀᤢ᤺ ᤖᤢ 
ᤖᤢ᤺ ᤗᤢ ᤗᤢ᤺ ᤀᤣ ᤀᤤ ᤀᤥ ᤀᤦ"), (MasaramGondi, "𑴀 𑴁 𑴂 𑴃 𑴄 𑴅 𑴦𑴶 𑴦𑴵 𑴧𑴴 𑴧𑴵 𑴆 𑴈 𑴉 𑴋"), (OlChiki, "ᱚ ᱟ ᱤ ᱤᱻ ᱩ ᱩᱻ ᱨᱩ ᱨᱩᱻ ᱞᱩ ᱞᱩᱻ ᱮ ᱚᱤ ᱳ ᱚᱩ"), (Takri, "𑚀 𑚁 𑚂 𑚃 𑚄 𑚅 𑚤𑚮 𑚤𑚯 𑚥𑚮 𑚥𑚯 𑚆 𑚇 𑚈 𑚉"), @@ -190,11 +198,12 @@ fn sanskrit_dependent_vowels() { ), (Wx, "ka kA ki kI ku kU kq kQ kL kLV ke kE ko kO"), // Indic + (Assamese, "ক কা কি কী কু কূ কৃ কৄ কৢ কৣ কে কৈ কো কৌ"), (Balinese, "ᬓ ᬓᬵ ᬓᬶ ᬓᬷ ᬓᬸ ᬓᬹ ᬓᬺ ᬓᬻ ᬓᬼ ᬓᬽ ᬓᬾ ᬓᬿ ᬓᭀ ᬓᭁ"), (Bengali, "ক কা কি কী কু কূ কৃ কৄ কৢ কৣ কে কৈ কো কৌ"), (Brahmi, "𑀓 𑀓𑀸 𑀓𑀺 𑀓𑀻 𑀓𑀼 𑀓𑀽 𑀓𑀾 𑀓𑀿 𑀓𑁀 𑀓𑁁 𑀓𑁂 𑀓𑁃 𑀓𑁄 𑀓𑁅"), (Burmese, "က ကာ ကိ ကီ ကု ကူ ကၖ ကၗ ကၘ ကၙ ကေ ကဲ ကော ကော်"), - // (Cham, "ꨆ ꨆꨩ ꨆꨪ ꨆꨫ ꨆꨭ ꨆꨭꨩ ꨆꨴꨮ ꨆꨴꨮꨩ ꨆꨵꨮ ꨆꨵꨮꨩ ꨆꨯꨮ ꨆꨰ ꨆꨯ ꨆꨯꨱ"), + (Cham, "ꨆ ꨆꨩ ꨆꨪ ꨆꨫ ꨆꨭ ꨆꨭꨩ ꨆꨴꨮ ꨆꨴꨮꨩ ꨆꨵꨮ ꨆꨵꨮꨩ ꨆꨯꨮ ꨆꨰ ꨆꨯ ꨆꨯꨱ"), (Devanagari, "क का कि की कु कू कृ कॄ कॢ कॣ के कै को कौ"), (Dogra, "𑠊 𑠊𑠬 𑠊𑠭 𑠊𑠮 𑠊𑠯 𑠊𑠰 𑠊𑠱 𑠊𑠲 𑠊𑠹𑠥𑠭 𑠊𑠹𑠥𑠮 𑠊𑠳 𑠊𑠴 𑠊𑠵 𑠊𑠶"), (Grantha, "𑌕 𑌕𑌾 𑌕𑌿 𑌕𑍀 𑌕𑍁 𑌕𑍂 𑌕𑍃 𑌕𑍄 𑌕𑍢 𑌕𑍣 𑌕𑍇 𑌕𑍈 𑌕𑍋 𑌕𑍌"), @@ -202,13 +211,15 @@ fn sanskrit_dependent_vowels() { (Javanese, "ꦏ ꦏꦴ ꦏꦶ ꦏꦷ ꦏꦸ ꦏꦹ ꦏꦽ ꦏ꧀ꦉꦴ ꦏ꧀ꦊ ꦏ꧀ꦋ ꦏꦺ ꦏꦻ ꦏꦺꦴ ꦏꦻꦴ"), (Kannada, "ಕ ಕಾ ಕಿ ಕೀ ಕು ಕೂ ಕೃ ಕೄ ಕೢ ಕೣ ಕೇ ಕೈ ಕೋ ಕೌ"), (Khmer, "ក កា កិ កី កុ កូ ក្ឫ ក្ឬ ក្ឭ ក្ឮ កេ កៃ កោ កៅ"), - (Malayalam, "ക കാ കി കീ കു കൂ കൃ കൄ കൢ കൣ കേ കൈ കോ കൌ"), + (Malayalam, "ക കാ കി കീ കു കൂ കൃ കൄ കൢ കൣ കേ കൈ കോ കൗ"), (Modi, "𑘎 𑘎𑘰 𑘎𑘱 𑘎𑘲 𑘎𑘳 𑘎𑘴 𑘎𑘵 𑘎𑘶 𑘎𑘷 𑘎𑘸 𑘎𑘹 𑘎𑘺 𑘎𑘻 𑘎𑘼"), + (Mon, "က ကာ ကိ ကဳ ကု ကူ ကၖ ကၗ ကၘ ကၙ ကေ ကဲ ကော ကော်"), (Newa, "𑐎 𑐎𑐵 𑐎𑐶 𑐎𑐷 𑐎𑐸 𑐎𑐹 𑐎𑐺 𑐎𑐻 𑐎𑐼 𑐎𑐽 𑐎𑐾 𑐎𑐿 𑐎𑑀 𑐎𑑁"), (Odia, "କ କା କି କୀ କୁ କୂ କୃ କୄ କୢ କୣ କେ କୈ କୋ କୌ"), (Saurashtra, "ꢒ ꢒꢵ ꢒꢶ ꢒꢷ ꢒꢸ ꢒꢹ ꢒꢺ ꢒꢻ ꢒꢼ ꢒꢽ ꢒꢿ ꢒꣀ ꢒꣂ ꢒꣃ"), (Sharada, "𑆑 𑆑𑆳 𑆑𑆴 𑆑𑆵 𑆑𑆶 𑆑𑆷 𑆑𑆸 𑆑𑆹 𑆑𑆺 𑆑𑆻 𑆑𑆼 𑆑𑆽 𑆑𑆾 𑆑𑆿"), (Sinhala, "ක කා කි කී කු කූ කෘ කෲ කෟ කෳ කේ කෛ කෝ කෞ"), + (Soyombo, "𑩜 𑩜𑩛 𑩜𑩑 𑩜𑩑𑩛 𑩜𑩒 𑩜𑩒𑩛 𑩜𑩙 𑩜𑩙𑩛 𑩜𑩚 𑩜𑩚𑩛 𑩜𑩔 𑩜𑩗 𑩜𑩖 𑩜𑩘"), (TaiTham, "ᨠ ᨠᩣ ᨠᩥ ᨠᩦ ᨠᩩ ᨠᩪ ᨠ᩺ᩂ ᨠ᩺ᩂᩣ ᨠ᩺ᩄ ᨠ᩺ᩄᩣ ᨠᩮ ᨠᩱ ᨠᩮᩣ ᨠᩮᩫᩣ"), (Tamil, "க கா கி கீ கு கூ க்ருʼ க்ரூʼ க்லுʼ க்லூʼ கே கை கோ கௌ"), (Telugu, "క కా కి కీ కు కూ కృ కౄ కౢ కౣ కే కై కో కౌ"), @@ -226,6 +237,7 @@ fn sanskrit_dependent_vowels() { // Scripts without vocalic L/LL assert_two_way_pairwise(&[ (Slp1, slp1_text), + (Kharoshthi, "𐨐 𐨐𐨌 𐨐𐨁 
𐨐𐨁𐨌 𐨐𐨂 𐨐𐨂𐨌 𐨐𐨃 𐨐𐨃𐨌 𐨐𐨿𐨫𐨂 𐨐𐨿𐨫𐨂𐨌 𐨐𐨅 𐨐𐨅𐨌 𐨐𐨆 𐨐𐨆𐨌"), (Nandinagari, "𑦮 𑦮𑧑 𑦮𑧒 𑦮𑧓 𑦮𑧔 𑦮𑧕 𑦮𑧖 𑦮𑧗 𑦮𑧠𑧉𑧔 𑦮𑧠𑧉𑧕 𑦮𑧚 𑦮𑧛 𑦮𑧜 𑦮𑧝"), (Siddham, "𑖎 𑖎𑖯 𑖎𑖰 𑖎𑖱 𑖎𑖲 𑖎𑖳 𑖎𑖴 𑖎𑖵 𑖎𑖿𑖩𑖰 𑖎𑖿𑖩𑖱 𑖎𑖸 𑖎𑖹 𑖎𑖺 𑖎𑖻"), ]); @@ -237,15 +249,20 @@ fn sanskrit_dependent_vowels() { (GunjalaGondi, "𑵱 𑵱𑶊 𑵱𑶋 𑵱𑶌 𑵱𑶍 𑵱𑶎 𑵱𑶗𑶈𑶍 𑵱𑶗𑶈𑶎 𑵱𑶗𑵵𑶍 𑵱𑶗𑵵𑶎 𑵱𑶐 𑵱𑶑 𑵱𑶓 𑵱𑶔"), (Kaithi, "𑂍 𑂍𑂰 𑂍𑂱 𑂍𑂲 𑂍𑂳 𑂍𑂴 𑂍𑂹𑂩𑂱 𑂍𑂹𑂩𑂲 𑂍𑂹𑂪𑂱 𑂍𑂹𑂪𑂲 𑂍𑂵 𑂍𑂶 𑂍𑂷 𑂍𑂸"), (Khudawadi, "𑊺 𑊺𑋠 𑊺𑋡 𑊺𑋢 𑊺𑋣 𑊺𑋤 𑊺𑋪𑋙𑋡 𑊺𑋪𑋙𑋢 𑊺𑋪𑋚𑋡 𑊺𑋪𑋚𑋢 𑊺𑋥 𑊺𑋦 𑊺𑋧 𑊺𑋨"), + (Limbu, "ᤁ ᤁᤠ ᤁᤡ ᤁᤡ᤺ ᤁᤢ ᤁᤢ᤺ ᤁᤪᤢ ᤁᤪᤢ᤺ ᤁ᤻ᤗᤢ ᤁ᤻ᤗᤢ᤺ ᤁᤣ ᤁᤤ ᤁᤥ ᤁᤦ"), (MasaramGondi, "𑴌 𑴌𑴱 𑴌𑴲 𑴌𑴳 𑴌𑴴 𑴌𑴵 𑴌𑴶 𑴌𑵇𑴵 𑴌𑵅𑴧𑴴 𑴌𑵅𑴧𑴵 𑴌𑴺 𑴌𑴼 𑴌𑴽 𑴌𑴿"), - ( - OlChiki, - "ᱠᱚ ᱠᱟ ᱠᱤ ᱠᱤᱻ ᱠᱩ ᱠᱩᱻ ᱠᱨᱩ ᱠᱨᱩᱻ ᱠᱞᱩ ᱠᱞᱩᱻ ᱠᱮ ᱠᱚᱤ ᱠᱳ ᱠᱚᱩ", - ), (Takri, "𑚊 𑚊𑚭 𑚊𑚮 𑚊𑚯 𑚊𑚰 𑚊𑚱 𑚊𑚶𑚤𑚮 𑚊𑚶𑚤𑚯 𑚊𑚶𑚥𑚮 𑚊𑚶𑚥𑚯 𑚊𑚲 𑚊𑚳 𑚊𑚴 𑚊𑚵"), (ZanabazarSquare, "𑨋 𑨋𑨊 𑨋𑨁 𑨋𑨁𑨊 𑨋𑨂 𑨋𑨂𑨊 𑨋𑩇𑨫𑨉 𑨋𑩇𑨫𑨉𑨊 𑨋𑩇𑨬𑨉 𑨋𑩇𑨬𑨉𑨊 𑨋𑨄 𑨋𑨄𑨊 𑨋𑨆 𑨋𑨆𑨊"), ]); + // Scripts without R/RR/L/LL or long O. + assert_transliterate( + slp1_text, + Slp1, + OlChiki, + "ᱠᱚ ᱠᱟ ᱠᱤ ᱠᱤᱻ ᱠᱩ ᱠᱩᱻ ᱠᱨᱩ ᱠᱨᱩᱻ ᱠᱞᱩ ᱠᱞᱩᱻ ᱠᱮ ᱠᱚᱤ ᱠᱳ ᱠᱚᱩ", + ); + // Scripts without R/RR/L/LL or long U. assert_transliterate( slp1_text, @@ -267,11 +284,12 @@ fn sanskrit_ayogavahas_etc() { (Velthuis, "a.m a.h a~m"), (Wx, "aM aH az"), // Indic + (Assamese, "অং অঃ অঁ"), (Balinese, "ᬅᬂ ᬅᬄ ᬅᬁ"), (Bengali, "অং অঃ অঁ"), (Bhaiksuki, "𑰀𑰽𑱃𑰀𑰾𑱃𑰀𑰼"), (Brahmi, "𑀅𑀁 𑀅𑀂 𑀅𑀀"), - // (Cham, "ꨀꩌ ꨀꩍ ꨀꩃ"), + (Cham, "ꨀꩌ ꨀꩍ ꨀꩃ"), (Devanagari, "अं अः अँ"), (Grantha, "𑌅𑌂 𑌅𑌃 𑌅𑌁"), (Gujarati, "અં અઃ અઁ"), @@ -299,13 +317,17 @@ fn sanskrit_ayogavahas_etc() { (Burmese, "အံ အး အံ"), (Dogra, "𑠀𑠷 𑠀𑠸 𑠀𑠷"), (GunjalaGondi, "𑵠𑶕 𑵠𑶖 𑵠𑶕"), + (Kharoshthi, "𐨀𐨎 𐨀𐨏 𐨀𐨎"), (Khmer, "អំ អះ អំ"), (Khudawadi, "𑊰𑋟 𑊰𑋞𑋪 𑊰𑋟"), + (Limbu, "ᤀᤱ ᤀᤜ᤻ ᤀᤱ"), (MasaramGondi, "𑴀𑵀 𑴀𑵁 𑴀𑵀"), (MeeteiMayek, "ꯑꯪ ꯑꯍ꯭ ꯑꯪ"), (Modi, "𑘀𑘽 𑘀𑘾 𑘀𑘽"), + (Mon, "အံ အး အံ"), (Nandinagari, "𑦠𑧞 𑦠𑧟 𑦠𑧞"), (Sinhala, "අං අඃ අං"), + (Soyombo, "𑩐𑪖 𑩐𑪗 𑩐𑪖"), (TaiTham, "ᩋᩴ ᩋᩡ ᩋᩴ"), (Takri, "𑚀𑚫 𑚀𑚬 𑚀𑚫"), (Thai, "อํ อห์ อํ"), @@ -327,6 +349,7 @@ fn sanskrit_consonants_non_vedic() { (Velthuis, "ka kha ga gha \"na ca cha ja jha ~na .ta .tha .da .dha .na ta tha da dha na pa pha ba bha ma ya ra la va \"sa .sa sa 
ha"), (Wx, "ka Ka ga Ga fa ca Ca ja Ja Fa ta Ta da Da Na wa Wa xa Xa na pa Pa ba Ba ma ya ra la va Sa Ra sa ha"), // Indic + (Assamese, "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য ৰ ল ৱ শ ষ স হ"), (Balinese, "ᬓ ᬔ ᬕ ᬖ ᬗ ᬘ ᬙ ᬚ ᬛ ᬜ ᬝ ᬞ ᬟ ᬠ ᬡ ᬢ ᬣ ᬤ ᬥ ᬦ ᬧ ᬨ ᬩ ᬪ ᬫ ᬬ ᬭ ᬮ ᬯ ᬰ ᬱ ᬲ ᬳ"), (Bhaiksuki, "𑰎𑱃𑰏𑱃𑰐𑱃𑰑𑱃𑰒𑱃𑰓𑱃𑰔𑱃𑰕𑱃𑰖𑱃𑰗𑱃𑰘𑱃𑰙𑱃𑰚𑱃𑰛𑱃𑰜𑱃𑰝𑱃𑰞𑱃𑰟𑱃𑰠𑱃𑰡𑱃𑰢𑱃𑰣𑱃𑰤𑱃𑰥𑱃𑰦𑱃𑰧𑱃𑰨𑱃𑰩𑱃𑰪𑱃𑰫𑱃𑰬𑱃𑰭𑱃𑰮"), (Brahmi, "𑀓 𑀔 𑀕 𑀖 𑀗 𑀘 𑀙 𑀚 𑀛 𑀜 𑀝 𑀞 𑀟 𑀠 𑀡 𑀢 𑀣 𑀤 𑀥 𑀦 𑀧 𑀨 𑀩 𑀪 𑀫 𑀬 𑀭 𑀮 𑀯 𑀰 𑀱 𑀲 𑀳"), @@ -343,6 +366,7 @@ fn sanskrit_consonants_non_vedic() { (Malayalam, "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ"), (MasaramGondi, "𑴌 𑴍 𑴎 𑴏 𑴐 𑴑 𑴒 𑴓 𑴔 𑴕 𑴖 𑴗 𑴘 𑴙 𑴚 𑴛 𑴜 𑴝 𑴞 𑴟 𑴠 𑴡 𑴢 𑴣 𑴤 𑴥 𑴦 𑴧 𑴨 𑴩 𑴪 𑴫 𑴬"), (Modi, "𑘎 𑘏 𑘐 𑘑 𑘒 𑘓 𑘔 𑘕 𑘖 𑘗 𑘘 𑘙 𑘚 𑘛 𑘜 𑘝 𑘞 𑘟 𑘠 𑘡 𑘢 𑘣 𑘤 𑘥 𑘦 𑘧 𑘨 𑘩 𑘪 𑘫 𑘬 𑘭 𑘮"), + (Mon, "က ခ ဂ ဃ ၚ စ ဆ ဇ ၛ ည ဋ ဌ ဍ ဎ ဏ တ ထ ဒ ဓ န ပ ဖ ဗ ဘ မ ယ ရ လ ဝ ၐ ၑ သ ဟ"), (Nandinagari, "𑦮 𑦯 𑦰 𑦱 𑦲 𑦳 𑦴 𑦵 𑦶 𑦷 𑦸 𑦹 𑦺 𑦻 𑦼 𑦽 𑦾 𑦿 𑧀 𑧁 𑧂 𑧃 𑧄 𑧅 𑧆 𑧇 𑧈 𑧉 𑧊 𑧋 𑧌 𑧍 𑧎"), (Newa, "𑐎 𑐏 𑐐 𑐑 𑐒 𑐔 𑐕 𑐖 𑐗 𑐘 𑐚 𑐛 𑐜 𑐝 𑐞 𑐟 𑐠 𑐡 𑐢 𑐣 𑐥 𑐦 𑐧 𑐨 𑐩 𑐫 𑐬 𑐮 𑐰 𑐱 𑐲 𑐳 𑐴"), (Odia, "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ"), @@ -350,6 +374,7 @@ fn sanskrit_consonants_non_vedic() { (Sharada, "𑆑 𑆒 𑆓 𑆔 𑆕 𑆖 𑆗 𑆘 𑆙 𑆚 𑆛 𑆜 𑆝 𑆞 𑆟 𑆠 𑆡 𑆢 𑆣 𑆤 𑆥 𑆦 𑆧 𑆨 𑆩 𑆪 𑆫 𑆬 𑆮 𑆯 𑆰 𑆱 𑆲"), (Siddham, "𑖎 𑖏 𑖐 𑖑 𑖒 𑖓 𑖔 𑖕 𑖖 𑖗 𑖘 𑖙 𑖚 𑖛 𑖜 𑖝 𑖞 𑖟 𑖠 𑖡 𑖢 𑖣 𑖤 𑖥 𑖦 𑖧 𑖨 𑖩 𑖪 𑖫 𑖬 𑖭 𑖮"), (Sinhala, "ක ඛ ග ඝ ඞ ච ඡ ජ ඣ ඤ ට ඨ ඩ ඪ ණ ත ථ ද ධ න ප ඵ බ භ ම ය ර ල ව ශ ෂ ස හ"), + (Soyombo, "𑩜 𑩝 𑩞 𑩟 𑩠 𑩵 𑩶 𑩷 𑩤 𑩥 𑩦 𑩧 𑩨 𑩩 𑩪 𑩫 𑩬 𑩭 𑩮 𑩯 𑩰 𑩱 𑩲 𑩳 𑩴 𑩻 𑩼 𑩽 𑩾 𑩿 𑪀 𑪁 𑪂"), (TaiTham, "ᨠ ᨡ ᨣ ᨥ ᨦ ᨧ ᨨ ᨩ ᨫ ᨬ ᨭ ᨮ ᨯ ᨰ ᨱ ᨲ ᨳ ᨴ ᨵ ᨶ ᨸ ᨹ ᨻ ᨽ ᨾ ᨿ ᩁ ᩃ ᩅ ᩆ ᩇ ᩈ ᩉ"), (Takri, "𑚊 𑚸 𑚌 𑚍 𑚎 𑚏 𑚐 𑚑 𑚒 𑚓 𑚔 𑚕 𑚖 𑚗 𑚘 𑚙 𑚚 𑚛 𑚜 𑚝 𑚞 𑚟 𑚠 𑚡 𑚢 𑚣 𑚤 𑚥 𑚦 𑚧 𑚋 𑚨 𑚩"), (Tamil, "க க² க³ க⁴ ங ச ச² ஜ ஜ² ஞ ட ட² ட³ ட⁴ ண த த² த³ த⁴ ந ப ப² ப³ ப⁴ ம ய ர ல வ ஶ ஷ ஸ ஹ"), @@ -374,6 +399,22 @@ fn sanskrit_consonants_non_vedic() { ], ); + // No nga or jha + assert_transliterate( + slp1, + Slp1, + Kharoshthi, + "𐨐 𐨑 𐨒 𐨓 𐨣 𐨕 𐨖 𐨗 𐨗 𐨙 𐨚 𐨛 𐨜 𐨝 𐨞 𐨟 𐨠 𐨡 𐨢 𐨣 𐨤 𐨥 𐨦 𐨧 𐨨 𐨩 𐨪 𐨫 𐨬 𐨭 𐨮 𐨯 𐨱", + ); + + // No jha, various nasals missing + assert_transliterate( + slp1, + Slp1, + Limbu, + "ᤁ 
ᤂ ᤃ ᤄ ᤅ ᤆ ᤇ ᤈ ᤈ ᤏ ᤋ ᤌ ᤍ ᤎ ᤏ ᤋ ᤌ ᤍ ᤎ ᤏ ᤐ ᤑ ᤒ ᤓ ᤔ ᤕ ᤖ ᤗ ᤘ ᤙ ᤙ ᤛ ᤜ", + ); + // No distinction between ca/cha and others. assert_transliterate( slp1, @@ -405,9 +446,10 @@ fn sanskrit_consonants_non_vedic() { // No distinction between Ta / ta assert_one_way_pairwise( (Slp1, slp1), - &[ - // (Cham, "ꨆ ꨇ ꨈ ꨉ ꨋ ꨌ ꨍ ꨎ ꨏ ꨑ ꨓ ꨔ ꨕ ꨖ ꨘ ꨓ ꨔ ꨕ ꨖ ꨘ ꨚ ꨜ ꨝ ꨞ ꨠ ꨢ ꨣ ꨤ ꨥ ꨦ ꨦ ꨧ ꨨ"), - ], + &[( + Cham, + "ꨆ ꨇ ꨈ ꨉ ꨋ ꨌ ꨍ ꨎ ꨏ ꨑ ꨓ ꨔ ꨕ ꨖ ꨘ ꨓ ꨔ ꨕ ꨖ ꨘ ꨚ ꨜ ꨝ ꨞ ꨠ ꨢ ꨣ ꨤ ꨥ ꨦ ꨦ ꨧ ꨨ", + )], ); } @@ -421,9 +463,10 @@ fn sanskrit_symbols() { (Velthuis, "0 1 2 3 4 5 6 7 8 9 | || .a"), (Wx, "0 1 2 3 4 5 6 7 8 9 . .. Z"), // Indic + (Assamese, "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ । ॥ ঽ"), (Bengali, "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ । ॥ ঽ"), (Bhaiksuki, "𑱐𑱃𑱑𑱃𑱒𑱃𑱓𑱃𑱔𑱃𑱕𑱃𑱖𑱃𑱗𑱃𑱘𑱃𑱙𑱃𑱁𑱃𑱂𑱃𑱀"), - // (Cham, "꩐ ꩑ ꩒ ꩓ ꩔ ꩕ ꩖ ꩗ ꩘ ꩙ ꩝ ꩞ '"), + (Cham, "꩐ ꩑ ꩒ ꩓ ꩔ ꩕ ꩖ ꩗ ꩘ ꩙ ꩝ ꩞ '"), (Devanagari, "० १ २ ३ ४ ५ ६ ७ ८ ९ । ॥ ऽ"), (Dogra, "० १ २ ३ ४ ५ ६ ७ ८ ९ । ॥ ऽ"), (Grantha, "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ । ॥ 𑌽"), @@ -431,18 +474,23 @@ fn sanskrit_symbols() { (GunjalaGondi, "𑶠 𑶡 𑶢 𑶣 𑶤 𑶥 𑶦 𑶧 𑶨 𑶩 . .. 
ऽ"), (Gujarati, "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ । ॥ ઽ"), (Gurmukhi, "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ । ॥ ऽ"), + (Javanese, "꧐ ꧑ ꧒ ꧓ ꧔ ꧕ ꧖ ꧗ ꧘ ꧙ ꧈ ꧉ '"), (Kaithi, "० १ २ ३ ४ ५ ६ ७ ८ ९ 𑃀 𑃁 ऽ"), (Kannada, "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ । ॥ ಽ"), (Khmer, "០ ១ ២ ៣ ៤ ៥ ៦ ៧ ៨ ៩ ។ ៕ ៜ"), (Khudawadi, "𑋰 𑋱 𑋲 𑋳 𑋴 𑋵 𑋶 𑋷 𑋸 𑋹 । ॥ ऽ"), + (Limbu, "᥆ ᥇ ᥈ ᥉ ᥊ ᥋ ᥌ ᥍ ᥎ ᥏ । ॥ '"), (Malayalam, "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ । ॥ ഽ"), (MasaramGondi, "𑵐 𑵑 𑵒 𑵓 𑵔 𑵕 𑵖 𑵗 𑵘 𑵙 । ॥ ऽ"), + (MeeteiMayek, "꯰ ꯱ ꯲ ꯳ ꯴ ꯵ ꯶ ꯷ ꯸ ꯹ ꯫ ꯫꯫ '"), (Modi, "𑙐 𑙑 𑙒 𑙓 𑙔 𑙕 𑙖 𑙗 𑙘 𑙙 𑙁 𑙂 ऽ"), + (Mon, "၀ ၁ ၂ ၃ ၄ ၅ ၆ ၇ ၈ ၉ ၊ ။ '"), (Nandinagari, "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ । ॥ 𑧡"), (Newa, "𑑐 𑑑 𑑒 𑑓 𑑔 𑑕 𑑖 𑑗 𑑘 𑑙 𑑋 𑑌 𑑇"), (Odia, "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ । ॥ ଽ"), (OlChiki, "᱐ ᱑ ᱒ ᱓ ᱔ ᱕ ᱖ ᱗ ᱘ ᱙ ᱾ ᱿ '"), (Saurashtra, "꣐ ꣑ ꣒ ꣓ ꣔ ꣕ ꣖ ꣗ ꣘ ꣙ ꣎ ꣏ ఽ"), + // (Soyombo, "0 1 2 3 4 5 6 7 8 9 𑪛 𑪜 '"), (TaiTham, "᪐ ᪑ ᪒ ᪓ ᪔ ᪕ ᪖ ᪗ ᪘ ᪙ ᪨ ᪩ '"), (Takri, "𑛀 𑛁 𑛂 𑛃 𑛄 𑛅 𑛆 𑛇 𑛈 𑛉 । ॥ ऽ"), (Telugu, "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ । ॥ ఽ"), @@ -474,24 +522,29 @@ fn sanskrit_basic_sentences() { (Balinese, "ᬦᬵᬭᬵᬬᬡᬂ ᬦᬫᬲ᭄ᬓᬺᬢ᭄ᬬ ᬦᬭᬂ ᬘᬿᬯ ᬦᬭᭀᬢ᭄ᬢᬫᬫ᭄ ᭞ ᬤᬾᬯᬷᬂ ᬲᬭᬲ᭄ᬯᬢᬷᬂ ᬘᬿᬯ ᬢᬢᭀ ᬚᬬᬫᬸᬤᬷᬭᬬᬾᬢ᭄ ᭟ ᭑ ᭟"), (Bhaiksuki, "𑰡𑰯𑰨𑰯𑰧𑰜𑰽𑱃𑰡𑰦𑰭𑰿𑰎𑰴𑰝𑰿𑰧𑱃𑰡𑰨𑰽𑱃𑰓𑰹𑰪𑱃𑰡𑰨𑰺𑰝𑰿𑰝𑰦𑰦𑰿𑱃𑱁𑱃𑰟𑰸𑰪𑰱𑰽𑱃𑰭𑰨𑰭𑰿𑰪𑰝𑰱𑰽𑱃𑰓𑰹𑰪𑱃𑰝𑰝𑰺𑱃𑰕𑰧𑰦𑰲𑰟𑰱𑰨𑰧𑰸𑰝𑰿𑱃𑱂𑱃𑱑𑱃𑱂"), (Brahmi, "𑀦𑀸𑀭𑀸𑀬𑀡𑀁 𑀦𑀫𑀲𑁆𑀓𑀾𑀢𑁆𑀬 𑀦𑀭𑀁 𑀘𑁃𑀯 𑀦𑀭𑁄𑀢𑁆𑀢𑀫𑀫𑁆 𑁇 𑀤𑁂𑀯𑀻𑀁 𑀲𑀭𑀲𑁆𑀯𑀢𑀻𑀁 𑀘𑁃𑀯 𑀢𑀢𑁄 𑀚𑀬𑀫𑀼𑀤𑀻𑀭𑀬𑁂𑀢𑁆 𑁈 𑁧 𑁈"), + (Burmese, "နာရာယဏံ နမသ္ကၖတျ နရံ စဲဝ နရောတ္တမမ် ၊ ဒေဝီံ သရသွတီံ စဲဝ တတော ဇယမုဒီရယေတ် ။ ၁ ။"), (Devanagari, "नारायणं नमस्कृत्य नरं चैव नरोत्तमम् । देवीं सरस्वतीं चैव ततो जयमुदीरयेत् ॥ १ ॥"), (Dogra, "𑠝𑠬𑠤𑠬𑠣𑠘𑠷 𑠝𑠢𑠩𑠹𑠊𑠱𑠙𑠹𑠣 𑠝𑠤𑠷 𑠏𑠴𑠦 𑠝𑠤𑠵𑠙𑠹𑠙𑠢𑠢𑠹 । 𑠛𑠳𑠦𑠮𑠷 𑠩𑠤𑠩𑠹𑠦𑠙𑠮𑠷 𑠏𑠴𑠦 𑠙𑠙𑠵 𑠑𑠣𑠢𑠯𑠛𑠮𑠤𑠣𑠳𑠙𑠹 ॥ १ ॥"), (Grantha, "𑌨𑌾𑌰𑌾𑌯𑌣𑌂 𑌨𑌮𑌸𑍍𑌕𑍃𑌤𑍍𑌯 𑌨𑌰𑌂 𑌚𑍈𑌵 𑌨𑌰𑍋𑌤𑍍𑌤𑌮𑌮𑍍 । 𑌦𑍇𑌵𑍀𑌂 𑌸𑌰𑌸𑍍𑌵𑌤𑍀𑌂 𑌚𑍈𑌵 𑌤𑌤𑍋 𑌜𑌯𑌮𑍁𑌦𑍀𑌰𑌯𑍇𑌤𑍍 ॥ ௧ ॥"), (Gujarati, "નારાયણં નમસ્કૃત્ય નરં ચૈવ નરોત્તમમ્ । દેવીં સરસ્વતીં ચૈવ તતો જયમુદીરયેત્ ॥ ૧ ॥"), + (Javanese, "ꦤꦴꦫꦴꦪꦟꦁ ꦤꦩꦱ꧀ꦏꦽꦠꦾ ꦤꦫꦁ ꦕꦻꦮ ꦤꦫꦺꦴꦠ꧀ꦠꦩꦩ꧀ ꧈ ꦢꦺꦮꦷꦁ ꦱꦫꦱ꧀ꦮꦠꦷꦁ ꦕꦻꦮ ꦠꦠꦺꦴ ꦗꦪꦩꦸꦢꦷꦫꦪꦺꦠ꧀ ꧉ ꧑ ꧉"), (Kaithi, "𑂢𑂰𑂩𑂰𑂨𑂝𑂁 𑂢𑂧𑂮𑂹𑂍𑂹𑂩𑂱𑂞𑂹𑂨 𑂢𑂩𑂁 𑂒𑂶𑂫 𑂢𑂩𑂷𑂞𑂹𑂞𑂧𑂧𑂹 𑃀 𑂠𑂵𑂫𑂲𑂁 𑂮𑂩𑂮𑂹𑂫𑂞𑂲𑂁 𑂒𑂶𑂫 𑂞𑂞𑂷 𑂔𑂨𑂧𑂳𑂠𑂲𑂩𑂨𑂵𑂞𑂹 𑃁 १ 𑃁"), (Kannada, "ನಾರಾಯಣಂ ನಮಸ್ಕೃತ್ಯ ನರಂ ಚೈವ ನರೋತ್ತಮಮ್ । ದೇವೀಂ ಸರಸ್ವತೀಂ ಚೈವ ತತೋ ಜಯಮುದೀರಯೇತ್ ॥ ೧ ॥"), + 
(Kharoshthi, "𐨣𐨌𐨪𐨌𐨩𐨞𐨎 𐨣𐨨𐨯𐨿𐨐𐨃𐨟𐨿𐨩 𐨣𐨪𐨎 𐨕𐨅𐨌𐨬 𐨣𐨪𐨆𐨟𐨿𐨟𐨨𐨨𐨿 𐩖 𐨡𐨅𐨬𐨁𐨌𐨎 𐨯𐨪𐨯𐨿𐨬𐨟𐨁𐨌𐨎 𐨕𐨅𐨌𐨬 𐨟𐨟𐨆 𐨗𐨩𐨨𐨂𐨡𐨁𐨌𐨪𐨩𐨅𐨟𐨿 𐩗 𐩀 𐩗"), (Khmer, "នារាយណំ នមស្ក្ឫត្យ នរំ ចៃវ នរោត្តមម៑ ។ ទេវីំ សរស្វតីំ ចៃវ តតោ ជយមុទីរយេត៑ ៕ ១ ៕"), (Khudawadi, "𑋑𑋠𑋙𑋠𑋘𑋌𑋟 𑋑𑋗𑋝𑋪𑊺𑋪𑋙𑋡𑋍𑋪𑋘 𑋑𑋙𑋟 𑋀𑋦𑋛 𑋑𑋙𑋧𑋍𑋪𑋍𑋗𑋗𑋪 । 𑋏𑋥𑋛𑋢𑋟 𑋝𑋙𑋝𑋪𑋛𑋍𑋢𑋟 𑋀𑋦𑋛 𑋍𑋍𑋧 𑋂𑋘𑋗𑋣𑋏𑋢𑋙𑋘𑋥𑋍𑋪 ॥ 𑋱 ॥"), + // (Limbu, "ᤏᤠᤖᤠᤕᤏᤱ ᤏᤔᤛ᤻ᤁᤪᤢᤋᤩ ᤏᤖᤱ ᤆᤤᤘ ᤏᤖᤥᤳᤋᤔᤶ । ᤍᤣᤘᤡ᤺ᤱ ᤛᤖᤛᤫᤋᤡ᤺ᤱ ᤆᤤᤘ ᤋᤋᤥ ᤈᤕᤔᤢᤍᤡ᤺ᤖᤕᤣᤳ ॥ ᥇ ॥"), (Malayalam, "നാരായണം നമസ്കൃത്യ നരം ചൈവ നരോത്തമമ് । ദേവീം സരസ്വതീം ചൈവ തതോ ജയമുദീരയേത് ॥ ൧ ॥"), (MasaramGondi, "𑴟𑴱𑴦𑴱𑴥𑴚𑵀 𑴟𑴤𑴫𑵅𑴌𑴶𑴛𑵅𑴥 𑴟𑴦𑵀 𑴑𑴼𑴨 𑴟𑴦𑴽𑴛𑵅𑴛𑴤𑴤𑵄 । 𑴝𑴺𑴨𑴳𑵀 𑴫𑴦𑴫𑵅𑴨𑴛𑴳𑵀 𑴑𑴼𑴨 𑴛𑴛𑴽 𑴓𑴥𑴤𑴴𑴝𑴳𑴦𑴥𑴺𑴛𑵄 ॥ 𑵑 ॥"), (Modi, "𑘡𑘰𑘨𑘰𑘧𑘜𑘽 𑘡𑘦𑘭𑘿𑘎𑘵𑘝𑘿𑘧 𑘡𑘨𑘽 𑘓𑘺𑘪 𑘡𑘨𑘻𑘝𑘿𑘝𑘦𑘦𑘿 𑙁 𑘟𑘹𑘪𑘲𑘽 𑘭𑘨𑘭𑘿𑘪𑘝𑘲𑘽 𑘓𑘺𑘪 𑘝𑘝𑘻 𑘕𑘧𑘦𑘳𑘟𑘲𑘨𑘧𑘹𑘝𑘿 𑙂 𑙑 𑙂"), + (Mon, "နာရာယဏံ နမသ္ကၖတျ နရံ စဲဝ နရောတ္တမမ် ၊ ဒေဝဳံ သရသွတဳံ စဲဝ တတော ဇယမုဒဳရယေတ် ။ ၁ ။"), (Nandinagari, "𑧁𑧑𑧈𑧑𑧇𑦼𑧞 𑧁𑧆𑧍𑧠𑦮𑧖𑦽𑧠𑧇 𑧁𑧈𑧞 𑦳𑧛𑧊 𑧁𑧈𑧜𑦽𑧠𑦽𑧆𑧆𑧠 । 𑦿𑧚𑧊𑧓𑧞 𑧍𑧈𑧍𑧠𑧊𑦽𑧓𑧞 𑦳𑧛𑧊 𑦽𑦽𑧜 𑦵𑧇𑧆𑧔𑦿𑧓𑧈𑧇𑧚𑦽𑧠 ॥ ೧ ॥"), (Newa, "𑐣𑐵𑐬𑐵𑐫𑐞𑑄 𑐣𑐩𑐳𑑂𑐎𑐺𑐟𑑂𑐫 𑐣𑐬𑑄 𑐔𑐿𑐰 𑐣𑐬𑑀𑐟𑑂𑐟𑐩𑐩𑑂 𑑋 𑐡𑐾𑐰𑐷𑑄 𑐳𑐬𑐳𑑂𑐰𑐟𑐷𑑄 𑐔𑐿𑐰 𑐟𑐟𑑀 𑐖𑐫𑐩𑐸𑐡𑐷𑐬𑐫𑐾𑐟𑑂 𑑌 𑑑 𑑌"), (Odia, "ନାରାଯଣଂ ନମସ୍କୃତ୍ଯ ନରଂ ଚୈଵ ନରୋତ୍ତମମ୍ । ଦେଵୀଂ ସରସ୍ଵତୀଂ ଚୈଵ ତତୋ ଜଯମୁଦୀରଯେତ୍ ॥ ୧ ॥"), (Saurashtra, "ꢥꢵꢬꢵꢫꢠꢀ ꢥꢪꢱ꣄ꢒꢺꢡ꣄ꢫ ꢥꢬꢀ ꢗꣀꢮ ꢥꢬꣂꢡ꣄ꢡꢪꢪ꣄ ꣎ ꢣꢿꢮꢷꢀ ꢱꢬꢱ꣄ꢮꢡꢷꢀ ꢗꣀꢮ ꢡꢡꣂ ꢙꢫꢪꢸꢣꢷꢬꢫꢿꢡ꣄ ꣏ ꣑ ꣏"), (Sharada, "𑆤𑆳𑆫𑆳𑆪𑆟𑆁 𑆤𑆩𑆱𑇀𑆑𑆸𑆠𑇀𑆪 𑆤𑆫𑆁 𑆖𑆽𑆮 𑆤𑆫𑆾𑆠𑇀𑆠𑆩𑆩𑇀 𑇅 𑆢𑆼𑆮𑆵𑆁 𑆱𑆫𑆱𑇀𑆮𑆠𑆵𑆁 𑆖𑆽𑆮 𑆠𑆠𑆾 𑆘𑆪𑆩𑆶𑆢𑆵𑆫𑆪𑆼𑆠𑇀 𑇆 𑇑 𑇆"), (Siddham, "𑖡𑖯𑖨𑖯𑖧𑖜𑖽 𑖡𑖦𑖭𑖿𑖎𑖴𑖝𑖿𑖧 𑖡𑖨𑖽 𑖓𑖹𑖪 𑖡𑖨𑖺𑖝𑖿𑖝𑖦𑖦𑖿 𑗂 𑖟𑖸𑖪𑖱𑖽 𑖭𑖨𑖭𑖿𑖪𑖝𑖱𑖽 𑖓𑖹𑖪 𑖝𑖝𑖺 𑖕𑖧𑖦𑖲𑖟𑖱𑖨𑖧𑖸𑖝𑖿 𑗃 1 𑗃"), - // (TaiTham, "ᨶᩣᩁᩣᨿᨱᩴ ᨶᨾᩈ᩠ᨠ᩺ᩂᨲ᩠ᨿ ᨶᩁᩴ ᨧᩱᩅ ᨶᩁᩮᩣᨲ᩠ᨲᨾᨾ᩺ ᪨ ᨴᩮᩅᩦᩴ ᩈᩁᩈ᩠ᩅᨲᩦᩴ ᨧᩱᩅ ᨲᨲᩮᩣ ᨩᨿᨾᩩᨴᩦᩁᨿᩮᨲ᩺ ᪩ ᪑ ᪩"), + (TaiTham, "ᨶᩣᩁᩣᨿᨱᩴ ᨶᨾᩈ᩠ᨠ᩺ᩂᨲ᩠ᨿ ᨶᩁᩴ ᨧᩱᩅ ᨶᩁᩮᩣᨲ᩠ᨲᨾᨾ᩺ ᪨ ᨴᩮᩅᩦᩴ ᩈᩁᩈ᩠ᩅᨲᩦᩴ ᨧᩱᩅ ᨲᨲᩮᩣ ᨩᨿᨾᩩᨴᩦᩁᨿᩮᨲ᩺ ᪩ ᪑ ᪩"), (Tamil, "நாராயணம்ʼ நமஸ்க்ருʼத்ய நரம்ʼ சைவ நரோத்தமம் . தே³வீம்ʼ ஸரஸ்வதீம்ʼ சைவ ததோ ஜயமுதீ³ரயேத் .. 1 .."), (Telugu, "నారాయణం నమస్కృత్య నరం చైవ నరోత్తమమ్ । దేవీం సరస్వతీం చైవ తతో జయముదీరయేత్ ॥ ౧ ॥"), (Thai, "นารายณํ นมสฺกฺฤตฺย นรํ ไจว นโรตฺตมมฺ ฯ เทวีํ สรสฺวตีํ ไจว ตโต ชยมุทีรเยตฺ ๚ ๑ ๚"), @@ -513,12 +566,55 @@ fn sanskrit_basic_sentences() { ], ); - // Non-reversible due to b/v and no virama. + // Non-reversible due to y-nukta and no virama. 
assert_transliterate( slp1_text, Slp1, - Tibetan, - "ནཱརཱཡཎཾ་ནམསྐྲྀཏྱ་ནརཾ་ཙཻབ་ནརོཏྟམམ་།་དེབཱིཾ་སརསྭཏཱིཾ་ཙཻབ་ཏཏོ་ཛཡམུདཱིརཡེཏ་༎་༡་༎", + Assamese, + "নাৰায়ণং নমস্কৃত্য নৰং চৈৱ নৰোত্তমম্ । দেৱীং সৰস্ৱতীং চৈৱ ততো জয়মুদীৰয়েৎ ॥ ১ ॥", + ); + + // Non-reversible due to no v/b distinction. + assert_transliterate( + slp1_text, + Slp1, + Bengali, + "নারায়ণং নমস্কৃত্য নরং চৈব নরোত্তমম্ । দেবীং সরস্বতীং চৈব ততো জয়মুদীরয়েৎ ॥ ১ ॥", + ); + + // Non-reversible due to no Ta/ta distinctions. + assert_transliterate( + slp1_text, + Slp1, + Cham, + "ꨘꨩꨣꨩꨢꨘꩌ ꨘꨠꩋꨆꨴꨮꨓꨳ ꨘꨣꩌ ꨌꨰꨥ ꨘꨣꨯꩅꨓꨠꩌ ꩝ ꨕꨯꨮꨥꨫꩌ ꨧꨣꨧꨶꨓꨫꩌ ꨌꨰꨥ ꨓꨓꨯ ꨎꨢꨠꨭꨕꨫꨣꨢꨯꨮꩅ ꩞ ꩑ ꩞", + ); + + // Non-reversible due to various na-s and sa-s. + assert_transliterate( + slp1_text, + Slp1, + GunjalaGondi, + "𑵺𑶊𑶈𑶊𑵬𑵺𑶕 𑵺𑵰𑶉𑶗𑵱𑶗𑶈𑶍𑵳𑶗𑵬 𑵺𑶈𑶕 𑵻𑶑𑵭 𑵺𑶈𑶓𑵳𑶗𑵳𑵰𑵰 . 𑵸𑶐𑵭𑶌𑶕 𑶉𑶈𑶉𑶗𑵭𑵳𑶌𑶕 𑵻𑶑𑵭 𑵳𑵳𑶓 𑶀𑵬𑵰𑶍𑵸𑶌𑶈𑵬𑶐𑵳 .. 𑶡 ..", + ); + + // Non-reversible due to no vocalic r/rr/l/ll + // TODO: how to use ADDAK? + /* + assert_transliterate( + slp1_text, + Slp1, + Gurmukhi, + "ਨਾਰਾਯਣੰ ਨਮਸ੍ਕ੍ਰੁਤ੍ਯ ਨਰੰ ਚੈਵ ਨਰੋੱਤਮਮ੍ । ਦੇਵੀਂ ਸਰਸ੍ਵਤੀਂ ਚੈਵ ਤਤੋ ਜਯਮੁਦੀਰਯੇਤ੍ ॥ 1 ॥", + ); + */ + + // Non-reversible due to ca/cha, vocalic r/rr/l/ll, etc. + assert_transliterate( + slp1_text, + Slp1, + MeeteiMayek, + "ꯅꯥꯔꯥꯌꯅꯪ ꯅꯃꯁ꯭ꯛꯔꯨꯠꯌ ꯅꯔꯪ ꯆꯩꯋ ꯅꯔꯣꯠꯇꯃꯝ ꯫ ꯗꯦꯋꯤꯪ ꯁꯔꯁ꯭ꯋꯇꯤꯪ ꯆꯩꯋ ꯇꯇꯣ ꯖꯌꯃꯨꯗꯤꯔꯌꯦꯠ ꯫꯫ ꯱ ꯫꯫", ); // Non-reversible due to no "sa" distinctions. @@ -532,13 +628,29 @@ fn sanskrit_basic_sentences() { ), ); - // Non-reversible due to various na-s and sa-s. + // Non-reversible due to no virama. + /* assert_transliterate( slp1_text, Slp1, - GunjalaGondi, - "𑵺𑶊𑶈𑶊𑵬𑵺𑶕 𑵺𑵰𑶉𑶗𑵱𑶗𑶈𑶍𑵳𑶗𑵬 𑵺𑶈𑶕 𑵻𑶑𑵭 𑵺𑶈𑶓𑵳𑶗𑵳𑵰𑵰 . 𑵸𑶐𑵭𑶌𑶕 𑶉𑶈𑶉𑶗𑵭𑵳𑶌𑶕 𑵻𑶑𑵭 𑵳𑵳𑶓 𑶀𑵬𑵰𑶍𑵸𑶌𑶈𑵬𑶐𑵳 .. 𑶡 ..", + Soyombo, + "𑩯𑩛𑩼𑩛𑩻𑩪𑪖 𑩯𑩴𑪁𑪙𑩜𑩙𑩫𑪙𑩻 𑩯𑩼𑪖 𑩵𑩗𑩾 𑩯𑩼𑩖𑩫𑪘𑩴𑩴 𑪛 𑩭𑩔𑩾𑩑𑩛𑪖 𑪁𑩼𑪁𑪙𑩾𑩫𑩑𑩛𑪖 𑩵𑩗𑩾 𑩫𑩫𑩖 𑩷𑩻𑩴𑩒𑩭𑩑𑩛𑩼𑩻𑩔𑩫 𑪜 1 𑪜", ); + */ + + // Non-reversible due to b/v and no virama. + assert_transliterate( + slp1_text, + Slp1, + Tibetan, + "ནཱརཱཡཎཾ་ནམསྐྲྀཏྱ་ནརཾ་ཙཻབ་ནརོཏྟམམ་།་དེབཱིཾ་སརསྭཏཱིཾ་ཙཻབ་ཏཏོ་ཛཡམུདཱིརཡེཏ་༎་༡་༎", + ); +} + +#[test] +fn cham() { + // Non-reversible due to no Ta/ta distinctions. 
+ assert_transliterate("namaskftya", Slp1, Cham, "ꨘꨠꩋꨆꨴꨮꨓꨳ"); } // Sanskrit (Vedic) @@ -552,9 +664,11 @@ fn sanskrit_vedic_accent() { (Itrans, r"a\_ a\'"), (Slp1, r"a\ a^"), // Indic + (Assamese, "অ॒ অ॑"), (Bengali, "অ॒ অ॑"), (Devanagari, "अ॒ अ॑"), (Grantha, "𑌅॒ 𑌅᳴"), + (Gujarati, "અ॒ અ॑"), (Kannada, "ಅ॒ ಅ॑"), (Malayalam, "അ॒ അ॑"), (Odia, "ଅ॒ ଅ॑"), @@ -577,6 +691,31 @@ fn sanskrit_vedic_accent() { ]); } +#[test] +fn sanskrit_vedic_svara_with_ayogavahas() { + assert_two_way_pairwise(&[ + (BarahaSouth, "aqH a#H a$H aqM a#M a$M"), + (Itrans, r#"a\_H a\'H a\"H a\_M a\'M a\"M"#), + (Devanagari, "अः॒ अः॑ अः᳚ अं॒ अं॑ अं᳚"), + ]); +} + +#[test] +fn sanskrit_upadhmaniya_and_jihvamuliya() { + assert_two_way_pairwise(&[ + (Iso15919, "kaẖ kaḫ"), + (Slp1, "kaZ kaV"), + // Indic + (Brahmi, "𑀓𑀃 𑀓𑀄"), + (Devanagari, "कᳵ कᳶ"), + (Kannada, "ಕೱ ಕೲ"), + (Newa, "𑐎𑑠 𑐎𑑡"), + (Sharada, "𑆑𑇂 𑆑𑇃"), + (Soyombo, "𑩜𑪄 𑩜𑪅"), + (Tibetan, "ཀྈ་ཀྉ"), + ]); +} + #[test] fn sanskrit_vedic_consonants() { assert_two_way_pairwise(&[ @@ -592,7 +731,7 @@ fn sanskrit_vedic_consonants() { (Balinese, "ᬮ᬴ ᬮ᬴᭄ᬳ"), (Bengali, "ল় ল়্হ"), (Brahmi, "𑀴 𑀴𑁆𑀳"), - (Burmese, "ဠ ဠ်ဟ"), + (Burmese, "ဠ ဠှ"), (Devanagari, "ळ ळ्ह"), (Grantha, "𑌳 𑌳𑍍𑌹"), (Gujarati, "ળ ળ્હ"), @@ -621,12 +760,14 @@ fn other_consonants() { #[test] fn short_e_and_o_vowels() { assert_two_way_pairwise(&[ + (BarahaSouth, "e ke o ko"), + (Iso15919, "e ke o ko"), + // Indic (Devanagari, "ऎ कॆ ऒ कॊ"), (Kannada, "ಎ ಕೆ ಒ ಕೊ"), (Malayalam, "എ കെ ഒ കൊ"), (Tamil, "எ கெ ஒ கொ"), (Telugu, "ఎ కె ఒ కొ"), - (Iso15919, "e ke o ko"), ]); } @@ -655,6 +796,23 @@ fn baraha_accents() { // TODO: aW } +#[test] +fn burmese_subjoined_consonants() { + assert_two_way_pairwise(&[ + ( + Slp1, + concat!( + "nka nKa nga nGa nNa nca nCa nja nJa nYa nwa nWa nqa nQa nRa ", + "nta nTa nda nDa nna npa nPa nba nBa nma nya nra nla nva nSa nza nsa nha" + ), + ), + ( + Burmese, + "န္က န္ခ န္ဂ န္ဃ န္င န္စ န္ဆ န္ဇ န္ဈ န္ဉ န္ဋ န္ဌ န္ဍ န္ဎ န္ဏ န္တ န္ထ န္ဒ န္ဓ န္န န္ပ န္ဖ န္ဗ န္ဘ န္မ နျ နြ န္လ 
နွ န္ၐ န္ၑ န္သ နှ", + ), + ]); +} + #[test] fn devanagari_prishthamatra() { let assert_has = @@ -697,17 +855,30 @@ fn iast_unicode_variants() { } #[test] -fn iso_tamil_aytam() { +fn iso_15919_bug_no_greedy_match_on_nfd() { + assert_transliterate("ai\u{0304} au\u{0304}", Iso15919, Devanagari, "अई अऊ"); + assert_transliterate( + "agh\u{0323} agh\u{0331} agh\u{032e}", + Iso15919, + Devanagari, + "अग्ः अग्ᳵ अग्ᳶ", + ); +} + +#[test] +fn iso_15919_tamil_aytam() { assert_transliterate("ஃ", Tamil, Iso15919, "ḳ"); assert_transliterate("\u{1e33}", Iso15919, Tamil, "ஃ"); assert_transliterate("k\u{0323}", Iso15919, Tamil, "ஃ"); } #[test] -fn iso_unicode_variants() { +fn iso_15919_unicode_variants() { assert_supports_unicode_variants( Scheme::Iso15919, - &["ā", "ī", "ū", "ṁ", "ḥ", "ṅ", "ñ", "ṭ", "ḍ", "ṇ", "ś", "ṣ"], + &[ + "ā", "ī", "ū", "ṁ", "ḥ", "ẖ", "ḫ", "ṅ", "ñ", "ṭ", "ḍ", "ṇ", "ś", "ṣ", + ], ); } @@ -771,6 +942,11 @@ fn itrans_backslash_escape() { assert_transliterate("nara\\", Itrans, Devanagari, "नर"); } +#[test] +fn javanese_medial_consonants() { + assert_two_way_pairwise(&[(Slp1, "nya nra"), (Javanese, "ꦤꦾ ꦤꦿ")]); +} + #[test] fn kannada_unicode_variants() { assert_supports_unicode_variants(Scheme::Kannada, &["ಕೀ", "ಕೇ", "ಕೈ", "ಕೊ", "ಕೋ"]); @@ -781,6 +957,30 @@ fn khmer_sign_robat() { assert_two_way_pairwise(&[(Slp1, "kra kara rka arka kara"), (Khmer, "ក្រ ករ ក៌ អក៌ ករ")]); } +#[test] +fn malayalam_chillus() { + // Our chillu support is currently limited to when we transliterate *from* Malayalam. 
+ let pairs = &[ + ("അൺക", "aRka", "अण्क"), + ("അൻക", "anka", "अन्क"), + ("അർക", "arka", "अर्क"), + ("അൽക", "alka", "अल्क"), + ]; + for (input, slp1, devanagari) in pairs { + assert_transliterate(input, Malayalam, Slp1, slp1); + assert_transliterate(input, Malayalam, Devanagari, devanagari); + } +} + +#[test] +fn malayalam_modern_and_archaic_au() { + // Modern sign au + assert_two_way_pairwise(&[(Slp1, "kO"), (Malayalam, "കൗ")]); + + // Archaic sign au + assert_transliterate("കൌ", Malayalam, Slp1, "kO"); +} + #[test] fn masaram_gondi_conjuncts() { assert_two_way_pairwise(&[(Slp1, "kza kzA"), (MasaramGondi, "𑴮 𑴮𑴱")]); diff --git a/vidyut-lipi/www/static/vidyut-lipi-app.js b/vidyut-lipi/www/static/vidyut-lipi-app.js index 943acd7..5a96ab6 100644 --- a/vidyut-lipi/www/static/vidyut-lipi-app.js +++ b/vidyut-lipi/www/static/vidyut-lipi-app.js @@ -19,7 +19,7 @@ import init, { transliterate, detect, Scheme } from "/static/wasm/vidyut_lipi.js let CHEAT_SHEET = [ "a A i I u U R RR lR lRR e ai o au aM aH", - "ka kha ga gha Na ca cha ja jha Ja", + "ka kha ga gha Ga ca cha ja jha Ja", "Ta Tha Da Dha Na ta tha da dha na", "pa pha ba bha ma ya ra la va", "za Sa sa ha La kSa jJa", @@ -115,33 +115,46 @@ let symbols = [ "ऽ" ]; -let schemes = [ +let orderedSchemes = [ Scheme.Devanagari, + Scheme.Assamese, Scheme.Balinese, Scheme.Bengali, Scheme.Bhaiksuki, Scheme.Brahmi, Scheme.Burmese, Scheme.Cham, + Scheme.Dogra, Scheme.Grantha, Scheme.Gujarati, + Scheme.GunjalaGondi, Scheme.Gurmukhi, Scheme.Javanese, + Scheme.Kaithi, Scheme.Kannada, Scheme.Khmer, + Scheme.Khudawadi, + Scheme.Limbu, Scheme.Malayalam, + Scheme.MasaramGondi, + Scheme.MeeteiMayek, Scheme.Modi, + Scheme.Nandinagari, Scheme.Newa, Scheme.Odia, + Scheme.OlChiki, Scheme.Saurashtra, Scheme.Sharada, Scheme.Siddham, Scheme.Sinhala, + Scheme.TaiTham, + Scheme.Takri, Scheme.Tamil, Scheme.Telugu, Scheme.Thai, Scheme.Tibetan, Scheme.Tirhuta, + Scheme.ZanabazarSquare, Scheme.BarahaSouth, Scheme.HarvardKyoto, @@ -155,31 +168,47 @@ 
let schemes = [ let schemeNames = { [Scheme.Devanagari]: "Devanagari", + [Scheme.Assamese]: "Assamese", [Scheme.Balinese]: "Balinese", [Scheme.Bengali]: "Bengali", [Scheme.Bhaiksuki]: "Bhaiksuki", [Scheme.Brahmi]: "Brahmi", [Scheme.Burmese]: "Burmese", [Scheme.Cham]: "Cham", + [Scheme.Dogra]: "Dogra", [Scheme.Grantha]: "Grantha", [Scheme.Gujarati]: "Gujarati", + [Scheme.GunjalaGondi]: "Gunjala Gondi", [Scheme.Gurmukhi]: "Gurmukhi", [Scheme.Javanese]: "Javanese", + [Scheme.Kaithi]: "Kaithi", [Scheme.Kannada]: "Kannada", + [Scheme.Kharoshthi]: "Kharoshthi", [Scheme.Khmer]: "Khmer", + [Scheme.Khudawadi]: "Khudawadi", + [Scheme.Limbu]: "Limbu", [Scheme.Malayalam]: "Malayalam", + [Scheme.MasaramGondi]: "Masaram Gondi", + [Scheme.MeeteiMayek]: "Meetei Mayek", [Scheme.Modi]: "Modi", + [Scheme.Mon]: "Mon", + [Scheme.Nandinagari]: "Nandinagari", [Scheme.Newa]: "Newa (Nepal Bhasa)", [Scheme.Odia]: "Odia", + [Scheme.OlChiki]: "Ol Chiki", [Scheme.Saurashtra]: "Saurashtra", [Scheme.Sharada]: "Sharada", [Scheme.Siddham]: "Siddham", [Scheme.Sinhala]: "Sinhala", + [Scheme.Soyombo]: "Soyombo", + [Scheme.TaiTham]: "Tai Tham", + [Scheme.Takri]: "Takri", [Scheme.Tamil]: "Tamil", [Scheme.Telugu]: "Telugu", [Scheme.Thai]: "Thai", [Scheme.Tibetan]: "Tibetan", [Scheme.Tirhuta]: "Tirhuta", + [Scheme.ZanabazarSquare]: "Zanabazar Square", [Scheme.BarahaSouth]: "Baraha (Southern)", [Scheme.HarvardKyoto]: "Harvard-Kyoto", @@ -300,10 +329,11 @@ const App = () => ({ async soundTable() { await this.initVidyut(); + let allTables = []; [VOWELS, MARKS, CONSONANTS, symbols].forEach((group) => { let table = []; - schemes.forEach((toScheme) => { + orderedSchemes.forEach((toScheme) => { let name = schemeNames[toScheme]; let row = []; group.forEach((sound) => {