ऩ

indic-transliteration · Oct 2, 2024 · 1c507cd · 1c507cd
1 parent f429a94
commit 1c507cd

Showing 3 changed files with 29 additions and 4 deletions.
diff --git a/indic_transliteration/aksharamukha_helper.py b/indic_transliteration/aksharamukha_helper.py
@@ -6,14 +6,26 @@
 import tqdm
 import regex
 
+
 def transliterate_tamil(text, dest_script="DEVANAGARI", aksharamukha_pre_options=["TamilTranscribe"], aksharamukha_post_options=[]):
   source_script = "TAMIL"
   dest_script = dest_script.capitalize()
   text = regex.sub("ற", r"ऱ", text)
-  # Unfortunately, can't even recover the superior transliteration by post-hoc replacement - ஏமாற்று ēmāṟṟu is transliterated to एमाट्रु, and not एमाट्र््ट्रु :-( It's like switching to a superior script and loosing information. So, must do pre-replacement. https://github.com/virtualvinodh/aksharamukha-python/issues/21
+  text = regex.sub("ன", r"ऩ", text)
+  # https://github.com/virtualvinodh/aksharamukha-python/issues/21
   text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = aksharamukha_pre_options, post_options = aksharamukha_post_options)
+  text = text.replace("\u200c", "")
   # https://github.com/virtualvinodh/aksharamukha-python/issues/22
-  text = regex.sub("म्([सव])", r"ं\1", text) 
+  text = regex.sub("म्([सव])", r"ं\1", text)
+  # text = regex.sub("।", r".", text)
+  return text
+
+
+def fix_naive_ta_transliterations(text):
+  text = regex.sub("म्([सव])", r"ं\1", text)
+  text = regex.sub("ट्र", r"ऱ्ऱ", text)
+  text = regex.sub("ण्ड्र", r"ऩ्ऱ", text)
+  text = regex.sub("न(?=\s)", r"ऩ", text)
   return text
 
 
@@ -23,7 +35,10 @@ def convert_file(source_path, dest_path, source_script, dest_script, pre_options
   with codecs.open(source_path, "r", "utf-8") as in_file, codecs.open(dest_path, "w", "utf-8") as out_file:
     text = in_file.read()
     while text:
-      out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options)
+      if source_script == "TAMIL":
+        out_text = transliterate_tamil(text, dest_script, pre_options, post_options)
+      else:
+        out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options)
       out_file.write(out_text)
       if source_path != dest_path:
         text = in_file.read()

diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@
   # Versions should comply with PEP440.  For a discussion on single-sourcing
   # the version across setup.py and the project code, see
   # https://packaging.python.org/en/latest/single_source_version.html
-  version='2.3.64',
+  version='2.3.65',
 
 
   description='Transliteration tools to convert text in one indic script encoding to another',

diff --git a/tests/aksharamukha_helper_test.py b/tests/aksharamukha_helper_test.py
@@ -0,0 +1,10 @@
+import pytest
+import sys
+
+from indic_transliteration import aksharamukha_helper
+
+
+
+def test_transliterate_tamil():
+  assert aksharamukha_helper.transliterate_tamil("அற்று") == "अऱ्ऱु"
+