Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
vvasuki committed Oct 2, 2024
1 parent f429a94 commit 1c507cd
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 4 deletions.
21 changes: 18 additions & 3 deletions indic_transliteration/aksharamukha_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,26 @@
import tqdm
import regex


def transliterate_tamil(text, dest_script="DEVANAGARI", aksharamukha_pre_options=["TamilTranscribe"], aksharamukha_post_options=[]):
    """Transliterate Tamil-script text with aksharamukha, plus local fix-ups.

    Args:
        text: Text in Tamil script.
        dest_script: Target script name. It is normalised with
            ``str.capitalize()`` (e.g. ``"DEVANAGARI"`` -> ``"Devanagari"``)
            before being handed to aksharamukha.
        aksharamukha_pre_options: Passed as ``pre_options`` to
            ``aksharamukha.transliterate.process``.
        aksharamukha_post_options: Passed as ``post_options`` to
            ``aksharamukha.transliterate.process``.

    Returns:
        The transliterated text with U+200C removed and ``म्`` before
        ``स``/``व`` rewritten as anusvara.
    """
    source_script = "TAMIL"
    dest_script = dest_script.capitalize()

    # Pre-replace ற and ன with ऱ and ऩ *before* calling aksharamukha.
    # Unfortunately the better transliteration cannot be recovered by post-hoc
    # replacement: ஏமாற்று (ēmāṟṟu) is transliterated to एमाट्रु, so the
    # information is already lost by then.
    # https://github.com/virtualvinodh/aksharamukha-python/issues/21
    # NOTE(review): the replacement characters are Devanagari regardless of
    # dest_script - confirm this is intended for non-Devanagari targets.
    text = text.replace("ற", "ऱ").replace("ன", "ऩ")

    text = aksharamukha.transliterate.process(
        src=source_script,
        tgt=dest_script,
        txt=text,
        nativize=True,
        # Hand over copies so the shared default lists in the signature can
        # never be mutated by the callee.
        pre_options=list(aksharamukha_pre_options),
        post_options=list(aksharamukha_post_options),
    )

    # Drop zero-width non-joiners (U+200C) from the output.
    text = text.replace("\u200c", "")

    # https://github.com/virtualvinodh/aksharamukha-python/issues/22
    # One pass is enough: the replacement never produces a new "म्".
    text = regex.sub(r"म्([सव])", r"ं\1", text)
    return text


def fix_naive_ta_transliterations(text):
    """Apply a fixed sequence of regex substitutions to Devanagari text.

    Judging by the name, the input is presumably Devanagari produced by a
    naive Tamil transliteration - confirm against callers.

    The substitutions, applied in this order, are:

    * ``म्`` followed by ``स`` or ``व`` -> anusvara (``ं``) plus that consonant
    * ``ट्र`` -> ``ऱ्ऱ``
    * ``ण्ड्र`` -> ``ऩ्ऱ``
    * ``न`` immediately followed by whitespace -> ``ऩ``

    A ``न`` at the very end of the string is not followed by whitespace and
    is therefore left unchanged.

    Args:
        text: The string to fix.

    Returns:
        The string with all substitutions applied.
    """
    # All patterns are raw strings; in particular "\s" in a non-raw literal is
    # an invalid escape sequence and triggers a SyntaxWarning on Python 3.12+.
    substitutions = (
        (r"म्([सव])", r"ं\1"),
        (r"ट्र", r"ऱ्ऱ"),
        (r"ण्ड्र", r"ऩ्ऱ"),
        (r"न(?=\s)", r"ऩ"),
    )
    for pattern, replacement in substitutions:
        text = regex.sub(pattern, replacement, text)
    return text


Expand All @@ -23,7 +35,10 @@ def convert_file(source_path, dest_path, source_script, dest_script, pre_options
with codecs.open(source_path, "r", "utf-8") as in_file, codecs.open(dest_path, "w", "utf-8") as out_file:
text = in_file.read()
while text:
out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options)
if source_script == "TAMIL":
out_text = transliterate_tamil(text, dest_script, pre_options, post_options)
else:
out_text = aksharamukha.transliterate.process(src=source_script, tgt=dest_script, txt=text, nativize = True, pre_options = pre_options, post_options = post_options)
out_file.write(out_text)
if source_path != dest_path:
text = in_file.read()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='2.3.64',
version='2.3.65',


description='Transliteration tools to convert text in one indic script encoding to another',
Expand Down
10 changes: 10 additions & 0 deletions tests/aksharamukha_helper_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pytest
import sys

from indic_transliteration import aksharamukha_helper



def test_transliterate_tamil():
    """Check the default-target transliteration of the Tamil word அற்று."""
    tamil_input = "அற்று"
    expected_output = "अऱ्ऱु"
    actual_output = aksharamukha_helper.transliterate_tamil(tamil_input)
    assert actual_output == expected_output

0 comments on commit 1c507cd

Please sign in to comment.