diff --git a/indic_transliteration/aksharamukha_helper.py b/indic_transliteration/aksharamukha_helper.py index 9bc0093..2b5e2a6 100644 --- a/indic_transliteration/aksharamukha_helper.py +++ b/indic_transliteration/aksharamukha_helper.py @@ -6,14 +6,26 @@ import tqdm import regex + def transliterate_tamil(text, dest_script="DEVANAGARI", aksharamukha_pre_options=["TamilTranscribe"], aksharamukha_post_options=[]): source_script = "TAMIL" dest_script = dest_script.capitalize() text = regex.sub("ற", r"ऱ", text) - # Unfortunately, can't even recover the superior transliteration by post-hoc replacement - ஏமாற்று ēmāṟṟu is transliterated to एमाट्रु, and not एमाट्र््ट्रु :-( It's like switching to a superior script and loosing information. So, must do pre-replacement. https://github.com/virtualvinodh/aksharamukha-python/issues/21 + text = regex.sub("ன", r"ऩ", text) + # https://github.com/virtualvinodh/aksharamukha-python/issues/21 text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = aksharamukha_pre_options, post_options = aksharamukha_post_options) + text = text.replace("\u200c", "") # https://github.com/virtualvinodh/aksharamukha-python/issues/22 - text = regex.sub("म्([सव])", r"ं\1", text) + text = regex.sub("म्([सव])", r"ं\1", text) + # text = regex.sub("।", r".", text) + return text + + +def fix_naive_ta_transliterations(text): + text = regex.sub("म्([सव])", r"ं\1", text) + text = regex.sub("ट्र", r"ऱ्ऱ", text) + text = regex.sub("ण्ड्र", r"ऩ्ऱ", text) + text = regex.sub("न(?=\s)", r"ऩ", text) return text @@ -23,7 +35,10 @@ def convert_file(source_path, dest_path, source_script, dest_script, pre_options with codecs.open(source_path, "r", "utf-8") as in_file, codecs.open(dest_path, "w", "utf-8") as out_file: text = in_file.read() while text: - out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options) + if source_script == "TAMIL": + out_text = transliterate_tamil(text, dest_script, pre_options, post_options) + else: + out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options) out_file.write(out_text) if source_path != dest_path: text = in_file.read() diff --git a/setup.py b/setup.py index 666feec..890697d 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='2.3.64', + version='2.3.65', description='Transliteration tools to convert text in one indic script encoding to another', diff --git a/tests/aksharamukha_helper_test.py b/tests/aksharamukha_helper_test.py new file mode 100644 index 0000000..1bd87ed --- /dev/null +++ b/tests/aksharamukha_helper_test.py @@ -0,0 +1,10 @@ +import pytest +import sys + +from indic_transliteration import aksharamukha_helper + + + +def test_transliterate_tamil(): + assert aksharamukha_helper.transliterate_tamil("அற்று") == "अऱ्ऱु" + \ No newline at end of file