#-*- coding:utf-8 -*-
"""debug_transcode.py

Debugging aid for the slp1 <-> deva1 transcoder tables.

Every ``{#X#}`` span of FILEIN is transcoded from TRANIN to TRANOUT and the
resulting lines are written to FILEOUT.

Usage:
    python debug_transcode.py TRANSCODERDIR TRANIN TRANOUT FILEIN FILEOUT

Background (from readme.txt, 10-06-2024)
----------------------------------------
Issue: https://github.com/sanskrit-lexicon/PWG/issues/78
Working directory: /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue78
Sample data:
 https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt

Proposed enhancement of slp1 transcoding for vowel-markers.
There is a 'special' transcoding of slp1 to Devanagari that is used in the
dictionaries pwg, pw, sch, pwkvn.  The transcoding files are slp1_deva1.xml
and its inverse deva1_slp1.xml.  The transcode0 directory holds the starting
versions of these two files; the revision is in the transcode1 directory.

    python debug_transcode.py transcode1 slp1 deva1 test01_in.txt test01_out.txt
    python debug_transcode.py transcode1 deva1 slp1 test01_out.txt test01_in_out.txt

test01_in_out.txt differs from test01_in.txt: the invertibility fails.
Note that deva1_slp1.xml defines a finite-state machine with only one state,
the INIT state.
"""
from __future__ import print_function
import sys
import re
import codecs

# Text to be transcoded is marked as {#...#}; group 1 is the marked text.
_SPAN_RE = re.compile(r'{#(.*?)#}')


def read_lines(filein):
    """Return the lines of the utf-8 file *filein* without line endings."""
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        lines = [x.rstrip('\r\n') for x in f]
    print(len(lines),"lines read from",filein)
    return lines


def write_lines(fileout,outarr):
    """Write each item of *outarr* as one utf-8 line of *fileout*."""
    with codecs.open(fileout,"w","utf-8") as f:
        for out in outarr:
            f.write(out+'\n')
    print(len(outarr),"lines written to",fileout)


def print_unicode(u):
    """Return one 'CODE | char | NAME' line per character of *u*.

    Despite its name this function prints nothing; it returns a list.
    Sample output:
x= a/MSa—BU/
0905 | अ | DEVANAGARI LETTER A
0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
0902 | ं | DEVANAGARI SIGN ANUSVARA
0936 | श | DEVANAGARI LETTER SHA
2014 | — | EM DASH
092D | भ | DEVANAGARI LETTER BHA
0942 | ू | DEVANAGARI VOWEL SIGN UU
0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    """
    import unicodedata
    outarr = []
    for c in u:
        # Without a default, unicodedata.name raises ValueError for code
        # points that have no name (control characters, private use, ...),
        # which would abort the very report this helper is meant to produce.
        name = unicodedata.name(c, '<unnamed>')
        outarr.append(f"{ord(c):04X} | {c} | {name}")
    return outarr


def transcode(x,tranin,tranout):
    """Transcode string *x* from encoding *tranin* to encoding *tranout*."""
    # Imported here rather than at module level so that the helpers above
    # remain importable when transcoder.py is not on sys.path.
    import transcoder
    return transcoder.transcoder_processString(x,tranin,tranout)


def convert_line(line,tranin,tranout,outarr):
    """Transcode the {#X#} spans of *line* and check invertibility.

    Each span X is converted to Y (tranin -> tranout).  Y is then converted
    back (tranout -> tranin); when the round trip does not reproduce X, a
    character-level report of both strings is appended to *outarr*.

    Returns the converted line.
    """
    def f(m):
        x = m.group(1)
        y = transcode(x,tranin,tranout)
        # check invertibility
        x1 = transcode(y,tranout,tranin)
        if x1 != x:
            y1 = transcode(x1,tranin,tranout)
            outarr.append(' x=%s, y=%s' %(x,y))
            outarr.extend(print_unicode(x))
            outarr.append('x1=%s, y1=%s' %(x1,y1))
            outarr.extend(print_unicode(x1))
        return '{#%s#}' % y

    return _SPAN_RE.sub(f,line)


def convert_lines(lines,tranin,tranout,outarr):
    """Apply convert_line to every line; return the list of new lines."""
    return [convert_line(line,tranin,tranout,outarr) for line in lines]


def test(lines,tranin=None,tranout=None):
    """Return *lines* with every {#X#} span transcoded tranin -> tranout.

    *tranin* and *tranout* used to be read from module globals that exist
    only when the file is run as a script.  They are now parameters; when
    omitted, module-level names 'tranin' / 'tranout' are still consulted so
    that existing callers keep working.

    Raises ValueError when no encoding names can be determined.
    """
    if tranin is None:
        tranin = globals().get('tranin')
    if tranout is None:
        tranout = globals().get('tranout')
    if tranin is None or tranout is None:
        # transcoder_processString returns its input unchanged when both
        # names are equal, so None/None would silently do nothing.
        raise ValueError("test() needs the tranin and tranout encoding names")

    def f(m):
        return '{#%s#}' % transcode(m.group(1),tranin,tranout)

    return [_SPAN_RE.sub(f,line) for line in lines]


# The name starts with 'test', so pytest would try to collect this helper as
# a test case whenever the module is star-imported into a test module.
test.__test__ = False


def main(argv=None):
    """Command-line entry point; returns a process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 6:
        print("Usage: python debug_transcode.py TRANSCODERDIR TRANIN TRANOUT"
              " FILEIN FILEOUT", file=sys.stderr)
        return 1
    import transcoder
    transcoderdir, tranin, tranout, filein, fileout = argv[1:6]
    transcoder.transcoder_set_dir(transcoderdir)

    lines = read_lines(filein)
    outarr = test(lines,tranin,tranout)
    write_lines(fileout,outarr)
    return 0


if __name__=="__main__":
    sys.exit(main())
+ diff --git a/pwgissues/issue78/test01_in.txt b/pwgissues/issue78/test01_in.txt new file mode 100644 index 0000000..d473d7a --- /dev/null +++ b/pwgissues/issue78/test01_in.txt @@ -0,0 +1,4 @@ +{#tarudU¬AlakA#} +{#pElA¬Ada#} +{#rA¬ARa/#} +{#¬i#} diff --git a/pwgissues/issue78/test01_in_out.txt b/pwgissues/issue78/test01_in_out.txt new file mode 100644 index 0000000..82891ce --- /dev/null +++ b/pwgissues/issue78/test01_in_out.txt @@ -0,0 +1,4 @@ +{#tarudUAlakA#} +{#pElAAda#} +{#rAARa/#} +{#i#} diff --git a/pwgissues/issue78/test01_out.txt b/pwgissues/issue78/test01_out.txt new file mode 100644 index 0000000..29e22b1 --- /dev/null +++ b/pwgissues/issue78/test01_out.txt @@ -0,0 +1,4 @@ +{#तरुदूालका#} +{#पैलााद#} +{#रााण꣫#} +{#ि#} diff --git a/pwgissues/issue78/transcode0/deva1_slp1.xml b/pwgissues/issue78/transcode0/deva1_slp1.xml new file mode 100644 index 0000000..4bc821a --- /dev/null +++ b/pwgissues/issue78/transcode0/deva1_slp1.xml @@ -0,0 +1,220 @@ + + + INIT \u0905 a + INIT \u0906 A + INIT \u0907 i + INIT \u0908 I + INIT \u0909 u + INIT \u090a U + INIT \u090b f + INIT \u0960 F + INIT \u090c x + INIT \u0961 X + INIT \u090f e + INIT \u0910 E + INIT \u0913 o + INIT \u0914 O + + INIT \u0905\ua8eb a/ + INIT \u0906\ua8eb A/ + INIT \u0907\ua8eb i/ + INIT \u0908\ua8eb I/ + INIT \u0909\ua8eb u/ + INIT \u090a\ua8eb U/ + INIT \u090b\ua8eb f/ + INIT \u0960\ua8eb F/ + INIT \u090c\ua8eb x/ + INIT \u0961\ua8eb X/ + INIT \u090f\ua8eb e/ + INIT \u0910\ua8eb E/ + INIT \u0913\ua8eb o/ + INIT \u0914\ua8eb O/ + + INIT \u0905\u0952 a\ + INIT \u0906\u0952 A\ + INIT \u0907\u0952 i\ + INIT \u0908\u0952 I\ + INIT \u0909\u0952 u\ + INIT \u090a\u0952 U\ + INIT \u090b\u0952 f\ + INIT \u0960\u0952 F\ + INIT \u090c\u0952 x\ + INIT \u0961\u0952 X\ + INIT \u090f\u0952 e\ + INIT \u0910\u0952 E\ + INIT \u0913\u0952 o\ + INIT \u0914\u0952 O\ + + INIT \u093e A + INIT \u093f i + INIT \u0940 I + INIT \u0941 u + INIT \u0942 U + INIT \u0943 f + INIT \u0944 F + INIT \u0962 x + INIT \u0963 X + INIT 
\u0947 e + INIT \u0948 E + INIT \u094b o + INIT \u094c O + + INIT \u0951 ^ + INIT \u093e\u0951 A^ + INIT \u093f\u0951 i^ + INIT \u0940\u0951 I^ + INIT \u0941\u0951 u^ + INIT \u0942\u0951 U^ + INIT \u0943\u0951 f^ + INIT \u0944\u0951 F^ + INIT \u0962\u0951 x^ + INIT \u0963\u0951 X^ + INIT \u0947\u0951 e^ + INIT \u0948\u0951 E^ + INIT \u094b\u0951 o^ + INIT \u094c\u0951 O^ + + INIT \u0952 \ + INIT \u093e\u0952 A\ + INIT \u093f\u0952 i\ + INIT \u0940\u0952 I\ + INIT \u0941\u0952 u\ + INIT \u0942\u0952 U\ + INIT \u0943\u0952 f\ + INIT \u0944\u0952 F\ + INIT \u0962\u0952 x\ + INIT \u0963\u0952 X\ + INIT \u0947\u0952 e\ + INIT \u0948\u0952 E\ + INIT \u094b\u0952 o\ + INIT \u094c\u0952 O\ + + INIT \ua8eb / + INIT \u093e\ua8eb A/ + INIT \u093f\ua8eb i/ + INIT \u0940\ua8eb I/ + INIT \u0941\ua8eb u/ + INIT \u0942\ua8eb U/ + INIT \u0943\ua8eb f/ + INIT \u0944\ua8eb F/ + INIT \u0962\ua8eb x/ + INIT \u0963\ua8eb X/ + INIT \u0947\ua8eb e/ + INIT \u0948\ua8eb E/ + INIT \u094b\ua8eb o/ + INIT \u094c\ua8eb O/ + + + INIT \u094d + + INIT \u0915/^ ka + INIT \u0916/^ Ka + INIT \u0917/^ ga + INIT \u0918/^ Ga + INIT \u0919/^ Na + INIT \u091a/^ ca + INIT \u091b/^ Ca + INIT \u091c/^ ja + INIT \u091d/^ Ja + INIT \u091e/^ Ya + INIT \u091f/^ wa + INIT \u0920/^ Wa + INIT \u0921/^ qa + INIT \u0922/^ Qa + INIT \u0923/^ Ra + INIT \u0924/^ ta + INIT \u0925/^ Ta + INIT \u0926/^ da + INIT \u0927/^ Da + INIT \u0928/^ na + INIT \u092a/^ pa + INIT \u092b/^ Pa + INIT \u092c/^ ba + INIT \u092d/^ Ba + INIT \u092e/^ ma + INIT \u092f/^ ya + INIT \u0930/^ ra + INIT \u0932/^ la + INIT \u0933/^ La + INIT \u0933\u094d\u0939/^ |a + INIT \u0935/^ va + INIT \u0936/^ Sa + INIT \u0937/^ za + INIT \u0938/^ sa + INIT \u0939/^ ha + + + + INIT \u0915 k + INIT \u0916 K + INIT \u0917 g + INIT \u0918 G + INIT \u0919 N + INIT \u091a c + INIT \u091b C + INIT \u091c j + INIT \u091d J + INIT \u091e Y + INIT \u091f w + INIT \u0920 W + INIT \u0921 q + INIT \u0922 Q + INIT \u0923 R + INIT \u0924 t + INIT \u0925 T + INIT \u0926 d 
+ INIT \u0927 D + INIT \u0928 n + INIT \u092a p + INIT \u092b P + INIT \u092c b + INIT \u092d B + INIT \u092e m + INIT \u092f y + INIT \u0930 r + INIT \u0932 l + INIT \u0933 L + INIT \u0933\u094d\u0939 | + INIT \u0935 v + INIT \u0936 S + INIT \u0937 z + INIT \u0938 s + INIT \u0939 h + INIT \u0902 M + INIT \u0903 H + INIT \u093d ' + INIT \u0964 . + INIT \u0965 .. + INIT \u0966 0 + INIT \u0967 1 + INIT \u0968 2 + INIT \u0969 3 + INIT \u096a 4 + INIT \u096b 5 + INIT \u096c 6 + INIT \u096d 7 + INIT \u096e 8 + INIT \u096f 9 + + INIT \u0901 ~ + + INIT \u0950 o~ + + + INIT \u0903\u0952 \H + INIT \u0902\u0952 \M + INIT \u0903\ua8eb /H + INIT \u0902\ua8eb /M + INIT \u0903\u0951 ^H + INIT \u0902\u0951 ^M + + INIT \u0903\u0951\u0952 ^\H + INIT \u0902\u0951\u0952 ^\M + INIT \u0901\u0952 \~ + INIT \u0901\u0951 ^~ + INIT \u0901\ua8eb /~ + INIT \u0901\u0903\ua8eb /~H + INIT \u1cf2 Z + diff --git a/pwgissues/issue78/transcode0/slp1_deva1.xml b/pwgissues/issue78/transcode0/slp1_deva1.xml new file mode 100644 index 0000000..8fcdaa1 --- /dev/null +++ b/pwgissues/issue78/transcode0/slp1_deva1.xml @@ -0,0 +1,175 @@ + + + INIT a \u0905 INIT + INIT A \u0906 INIT + INIT i \u0907 INIT + INIT I \u0908 INIT + INIT u \u0909 INIT + INIT U \u090a INIT + INIT f \u090b INIT + INIT F \u0960 INIT + INIT x \u090c INIT + INIT X \u0961 INIT + INIT e \u090f INIT + INIT E \u0910 INIT + INIT o \u0913 INIT + INIT O \u0914 INIT + + SKT a INIT + SKT A \u093e INIT + SKT i \u093f INIT + SKT I \u0940 INIT + SKT u \u0941 INIT + SKT U \u0942 INIT + SKT f \u0943 INIT + SKT F \u0944 INIT + SKT x \u0962 INIT + SKT X \u0963 INIT + SKT e \u0947 INIT + SKT E \u0948 INIT + SKT o \u094b INIT + SKT O \u094c INIT + + + INIT,SKT k/^([^aAiIuUfFxXeEoO^/\\]) \u0915\u094d SKT + INIT,SKT K/^([^aAiIuUfFxXeEoO^/\\]) \u0916\u094d SKT + INIT,SKT g/^([^aAiIuUfFxXeEoO^/\\]) \u0917\u094d SKT + INIT,SKT G/^([^aAiIuUfFxXeEoO^/\\]) \u0918\u094d SKT + INIT,SKT N/^([^aAiIuUfFxXeEoO^/\\]) \u0919\u094d SKT + INIT,SKT 
c/^([^aAiIuUfFxXeEoO^/\\]) \u091a\u094d SKT + INIT,SKT C/^([^aAiIuUfFxXeEoO^/\\]) \u091b\u094d SKT + INIT,SKT j/^([^aAiIuUfFxXeEoO^/\\]) \u091c\u094d SKT + INIT,SKT J/^([^aAiIuUfFxXeEoO^/\\]) \u091d\u094d SKT + INIT,SKT Y/^([^aAiIuUfFxXeEoO^/\\]) \u091e\u094d SKT + INIT,SKT w/^([^aAiIuUfFxXeEoO^/\\]) \u091f\u094d SKT + INIT,SKT W/^([^aAiIuUfFxXeEoO^/\\]) \u0920\u094d SKT + INIT,SKT q/^([^aAiIuUfFxXeEoO^/\\]) \u0921\u094d SKT + INIT,SKT Q/^([^aAiIuUfFxXeEoO^/\\]) \u0922\u094d SKT + INIT,SKT R/^([^aAiIuUfFxXeEoO^/\\]) \u0923\u094d SKT + INIT,SKT t/^([^aAiIuUfFxXeEoO^/\\]) \u0924\u094d SKT + INIT,SKT T/^([^aAiIuUfFxXeEoO^/\\]) \u0925\u094d SKT + INIT,SKT d/^([^aAiIuUfFxXeEoO^/\\]) \u0926\u094d SKT + INIT,SKT D/^([^aAiIuUfFxXeEoO^/\\]) \u0927\u094d SKT + INIT,SKT n/^([^aAiIuUfFxXeEoO^/\\]) \u0928\u094d SKT + INIT,SKT p/^([^aAiIuUfFxXeEoO^/\\]) \u092a\u094d SKT + INIT,SKT P/^([^aAiIuUfFxXeEoO^/\\]) \u092b\u094d SKT + INIT,SKT b/^([^aAiIuUfFxXeEoO^/\\]) \u092c\u094d SKT + INIT,SKT B/^([^aAiIuUfFxXeEoO^/\\]) \u092d\u094d SKT + INIT,SKT m/^([^aAiIuUfFxXeEoO^/\\]) \u092e\u094d SKT + INIT,SKT y/^([^aAiIuUfFxXeEoO^/\\]) \u092f\u094d SKT + INIT,SKT r/^([^aAiIuUfFxXeEoO^/\\]) \u0930\u094d SKT + INIT,SKT l/^([^aAiIuUfFxXeEoO^/\\]) \u0932\u094d SKT + INIT,SKT L/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d SKT + INIT,SKT |/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d\u0939\u094d SKT + INIT,SKT v/^([^aAiIuUfFxXeEoO^/\\]) \u0935\u094d SKT + INIT,SKT S/^([^aAiIuUfFxXeEoO^/\\]) \u0936\u094d SKT + INIT,SKT z/^([^aAiIuUfFxXeEoO^/\\]) \u0937\u094d SKT + INIT,SKT s/^([^aAiIuUfFxXeEoO^/\\]) \u0938\u094d SKT + INIT,SKT h/^([^aAiIuUfFxXeEoO^/\\]) \u0939\u094d SKT + INIT,SKT k \u0915 SKT + INIT,SKT K \u0916 SKT + INIT,SKT g \u0917 SKT + INIT,SKT G \u0918 SKT + INIT,SKT N \u0919 SKT + INIT,SKT c \u091a SKT + INIT,SKT C \u091b SKT + INIT,SKT j \u091c SKT + INIT,SKT J \u091d SKT + INIT,SKT Y \u091e SKT + INIT,SKT w \u091f SKT + INIT,SKT W \u0920 SKT + INIT,SKT q \u0921 SKT + INIT,SKT Q \u0922 SKT + INIT,SKT R 
\u0923 SKT + INIT,SKT t \u0924 SKT + INIT,SKT T \u0925 SKT + INIT,SKT d \u0926 SKT + INIT,SKT D \u0927 SKT + INIT,SKT n \u0928 SKT + INIT,SKT p \u092a SKT + INIT,SKT P \u092b SKT + INIT,SKT b \u092c SKT + INIT,SKT B \u092d SKT + INIT,SKT m \u092e SKT + INIT,SKT y \u092f SKT + INIT,SKT r \u0930 SKT + INIT,SKT l \u0932 SKT + INIT,SKT L \u0933 SKT + + INIT,SKT | \u0933\u094d\u0939 SKT + INIT,SKT v \u0935 SKT + INIT,SKT S \u0936 SKT + INIT,SKT z \u0937 SKT + INIT,SKT s \u0938 SKT + INIT,SKT h \u0939 SKT + + INIT,SKT M \u0902 INIT + INIT,SKT H \u0903 INIT + INIT,SKT ' \u093d INIT + INIT,SKT . \u0964 INIT + INIT,SKT .. \u0965 INIT + INIT,SKT 0 \u0966 INIT + INIT,SKT 1 \u0967 INIT + INIT,SKT 2 \u0968 INIT + INIT,SKT 3 \u0969 INIT + INIT,SKT 4 \u096a INIT + INIT,SKT 5 \u096b INIT + INIT,SKT 6 \u096c INIT + INIT,SKT 7 \u096d INIT + INIT,SKT 8 \u096e INIT + INIT,SKT 9 \u096f INIT + INIT,SKT \u0020 \u0020 INIT + INIT,SKT \u0009 \u0009 INIT + INIT,SKT \u000d \u000d INIT + INIT,SKT \u000a \u000a INIT + + + INIT,SKT - - INIT + + INIT,SKT \ \u0952 INIT + + INIT,SKT / \ua8eb INIT + + INIT,SKT ^ \u0951 INIT + + + INIT,SKT ~ \u0901 INIT + + INIT,SKT o~ \u0950 INIT + + + INIT,SKT Z \u1cf2 INIT + INIT,SKT V \u1cf2 INIT + + + INIT,SKT \H \u0903\u0952 SKT + INIT,SKT \M \u0902\u0952 SKT + INIT,SKT /H \u0903\ua8eb SKT + INIT,SKT /M \u0902\ua8eb SKT + INIT,SKT ^H \u0903\u0951 SKT + INIT,SKT ^M \u0902\u0951 SKT + + INIT,SKT ^\H \u0903\u0951\u0952 SKT + INIT,SKT ^\M \u0902\u0951\u0952 SKT + INIT,SKT \~ \u0901\u0952 SKT + INIT,SKT ^~ \u0901\u0951 SKT + INIT,SKT /~ \u0901\ua8eb SKT + INIT,SKT /~H \u0901\u0903\ua8eb SKT + + + INIT,SKT £ \ua8f2 INIT + + diff --git a/pwgissues/issue78/transcode1/deva1_slp1.xml b/pwgissues/issue78/transcode1/deva1_slp1.xml new file mode 100644 index 0000000..e6a9ea0 --- /dev/null +++ b/pwgissues/issue78/transcode1/deva1_slp1.xml @@ -0,0 +1,236 @@ + + + INIT \u0905 a + INIT \u0906 A + INIT \u0907 i + INIT \u0908 I + INIT \u0909 u + INIT \u090a U + INIT \u090b f + 
INIT \u0960 F + INIT \u090c x + INIT \u0961 X + INIT \u090f e + INIT \u0910 E + INIT \u0913 o + INIT \u0914 O + + INIT \u0905\ua8eb a/ + INIT \u0906\ua8eb A/ + INIT \u0907\ua8eb i/ + INIT \u0908\ua8eb I/ + INIT \u0909\ua8eb u/ + INIT \u090a\ua8eb U/ + INIT \u090b\ua8eb f/ + INIT \u0960\ua8eb F/ + INIT \u090c\ua8eb x/ + INIT \u0961\ua8eb X/ + INIT \u090f\ua8eb e/ + INIT \u0910\ua8eb E/ + INIT \u0913\ua8eb o/ + INIT \u0914\ua8eb O/ + + INIT \u0905\u0952 a\ + INIT \u0906\u0952 A\ + INIT \u0907\u0952 i\ + INIT \u0908\u0952 I\ + INIT \u0909\u0952 u\ + INIT \u090a\u0952 U\ + INIT \u090b\u0952 f\ + INIT \u0960\u0952 F\ + INIT \u090c\u0952 x\ + INIT \u0961\u0952 X\ + INIT \u090f\u0952 e\ + INIT \u0910\u0952 E\ + INIT \u0913\u0952 o\ + INIT \u0914\u0952 O\ + + INIT \u093e A + INIT \u093f i + INIT \u0940 I + INIT \u0941 u + INIT \u0942 U + INIT \u0943 f + INIT \u0944 F + INIT \u0962 x + INIT \u0963 X + INIT \u0947 e + INIT \u0948 E + INIT \u094b o + INIT \u094c O + + INIT \u0951 ^ + INIT \u093e\u0951 A^ + INIT \u093f\u0951 i^ + INIT \u0940\u0951 I^ + INIT \u0941\u0951 u^ + INIT \u0942\u0951 U^ + INIT \u0943\u0951 f^ + INIT \u0944\u0951 F^ + INIT \u0962\u0951 x^ + INIT \u0963\u0951 X^ + INIT \u0947\u0951 e^ + INIT \u0948\u0951 E^ + INIT \u094b\u0951 o^ + INIT \u094c\u0951 O^ + + INIT \u0952 \ + INIT \u093e\u0952 A\ + INIT \u093f\u0952 i\ + INIT \u0940\u0952 I\ + INIT \u0941\u0952 u\ + INIT \u0942\u0952 U\ + INIT \u0943\u0952 f\ + INIT \u0944\u0952 F\ + INIT \u0962\u0952 x\ + INIT \u0963\u0952 X\ + INIT \u0947\u0952 e\ + INIT \u0948\u0952 E\ + INIT \u094b\u0952 o\ + INIT \u094c\u0952 O\ + + INIT \ua8eb / + INIT \u093e\ua8eb A/ + INIT \u093f\ua8eb i/ + INIT \u0940\ua8eb I/ + INIT \u0941\ua8eb u/ + INIT \u0942\ua8eb U/ + INIT \u0943\ua8eb f/ + INIT \u0944\ua8eb F/ + INIT \u0962\ua8eb x/ + INIT \u0963\ua8eb X/ + INIT \u0947\ua8eb e/ + INIT \u0948\ua8eb E/ + INIT \u094b\ua8eb o/ + INIT \u094c\ua8eb O/ + + + INIT \u094d + + INIT \u0915/^ ka + INIT \u0916/^ Ka + INIT \u0917/^ ga + 
INIT \u0918/^ Ga + INIT \u0919/^ Na + INIT \u091a/^ ca + INIT \u091b/^ Ca + INIT \u091c/^ ja + INIT \u091d/^ Ja + INIT \u091e/^ Ya + INIT \u091f/^ wa + INIT \u0920/^ Wa + INIT \u0921/^ qa + INIT \u0922/^ Qa + INIT \u0923/^ Ra + INIT \u0924/^ ta + INIT \u0925/^ Ta + INIT \u0926/^ da + INIT \u0927/^ Da + INIT \u0928/^ na + INIT \u092a/^ pa + INIT \u092b/^ Pa + INIT \u092c/^ ba + INIT \u092d/^ Ba + INIT \u092e/^ ma + INIT \u092f/^ ya + INIT \u0930/^ ra + INIT \u0932/^ la + INIT \u0933/^ La + INIT \u0933\u094d\u0939/^ |a + INIT \u0935/^ va + INIT \u0936/^ Sa + INIT \u0937/^ za + INIT \u0938/^ sa + INIT \u0939/^ ha + + + + INIT \u0915 k + INIT \u0916 K + INIT \u0917 g + INIT \u0918 G + INIT \u0919 N + INIT \u091a c + INIT \u091b C + INIT \u091c j + INIT \u091d J + INIT \u091e Y + INIT \u091f w + INIT \u0920 W + INIT \u0921 q + INIT \u0922 Q + INIT \u0923 R + INIT \u0924 t + INIT \u0925 T + INIT \u0926 d + INIT \u0927 D + INIT \u0928 n + INIT \u092a p + INIT \u092b P + INIT \u092c b + INIT \u092d B + INIT \u092e m + INIT \u092f y + INIT \u0930 r + INIT \u0932 l + INIT \u0933 L + INIT \u0933\u094d\u0939 | + INIT \u0935 v + INIT \u0936 S + INIT \u0937 z + INIT \u0938 s + INIT \u0939 h + INIT \u0902 M + INIT \u0903 H + INIT \u093d ' + INIT \u0964 . + INIT \u0965 .. 
+ INIT \u0966 0 + INIT \u0967 1 + INIT \u0968 2 + INIT \u0969 3 + INIT \u096a 4 + INIT \u096b 5 + INIT \u096c 6 + INIT \u096d 7 + INIT \u096e 8 + INIT \u096f 9 + + INIT \u0901 ~ + + INIT \u0950 o~ + + + INIT \u0903\u0952 \H + INIT \u0902\u0952 \M + INIT \u0903\ua8eb /H + INIT \u0902\ua8eb /M + INIT \u0903\u0951 ^H + INIT \u0902\u0951 ^M + + INIT \u0903\u0951\u0952 ^\H + INIT \u0902\u0951\u0952 ^\M + INIT \u0901\u0952 \~ + INIT \u0901\u0951 ^~ + INIT \u0901\ua8eb /~ + INIT \u0901\u0903\ua8eb /~H + INIT \u1cf2 Z + + + INIT \u093e ¬A + INIT \u093f ¬i + INIT \u0940 ¬I + INIT \u0941 ¬u + INIT \u0942 ¬U + INIT \u0943 ¬f + INIT \u0944 ¬F + INIT \u0962 ¬x + INIT \u0963 ¬X + INIT \u0947 ¬e + INIT \u0948 ¬E + INIT \u094b ¬o + INIT \u094c ¬O + + diff --git a/pwgissues/issue78/transcode1/slp1_deva1.xml b/pwgissues/issue78/transcode1/slp1_deva1.xml new file mode 100644 index 0000000..737208e --- /dev/null +++ b/pwgissues/issue78/transcode1/slp1_deva1.xml @@ -0,0 +1,195 @@ + + + INIT a \u0905 INIT + INIT A \u0906 INIT + INIT i \u0907 INIT + INIT I \u0908 INIT + INIT u \u0909 INIT + INIT U \u090a INIT + INIT f \u090b INIT + INIT F \u0960 INIT + INIT x \u090c INIT + INIT X \u0961 INIT + INIT e \u090f INIT + INIT E \u0910 INIT + INIT o \u0913 INIT + INIT O \u0914 INIT + + SKT a INIT + SKT A \u093e INIT + SKT i \u093f INIT + SKT I \u0940 INIT + SKT u \u0941 INIT + SKT U \u0942 INIT + SKT f \u0943 INIT + SKT F \u0944 INIT + SKT x \u0962 INIT + SKT X \u0963 INIT + SKT e \u0947 INIT + SKT E \u0948 INIT + SKT o \u094b INIT + SKT O \u094c INIT + + + INIT,SKT k/^([^aAiIuUfFxXeEoO^/\\]) \u0915\u094d SKT + INIT,SKT K/^([^aAiIuUfFxXeEoO^/\\]) \u0916\u094d SKT + INIT,SKT g/^([^aAiIuUfFxXeEoO^/\\]) \u0917\u094d SKT + INIT,SKT G/^([^aAiIuUfFxXeEoO^/\\]) \u0918\u094d SKT + INIT,SKT N/^([^aAiIuUfFxXeEoO^/\\]) \u0919\u094d SKT + INIT,SKT c/^([^aAiIuUfFxXeEoO^/\\]) \u091a\u094d SKT + INIT,SKT C/^([^aAiIuUfFxXeEoO^/\\]) \u091b\u094d SKT + INIT,SKT j/^([^aAiIuUfFxXeEoO^/\\]) \u091c\u094d SKT + 
INIT,SKT J/^([^aAiIuUfFxXeEoO^/\\]) \u091d\u094d SKT + INIT,SKT Y/^([^aAiIuUfFxXeEoO^/\\]) \u091e\u094d SKT + INIT,SKT w/^([^aAiIuUfFxXeEoO^/\\]) \u091f\u094d SKT + INIT,SKT W/^([^aAiIuUfFxXeEoO^/\\]) \u0920\u094d SKT + INIT,SKT q/^([^aAiIuUfFxXeEoO^/\\]) \u0921\u094d SKT + INIT,SKT Q/^([^aAiIuUfFxXeEoO^/\\]) \u0922\u094d SKT + INIT,SKT R/^([^aAiIuUfFxXeEoO^/\\]) \u0923\u094d SKT + INIT,SKT t/^([^aAiIuUfFxXeEoO^/\\]) \u0924\u094d SKT + INIT,SKT T/^([^aAiIuUfFxXeEoO^/\\]) \u0925\u094d SKT + INIT,SKT d/^([^aAiIuUfFxXeEoO^/\\]) \u0926\u094d SKT + INIT,SKT D/^([^aAiIuUfFxXeEoO^/\\]) \u0927\u094d SKT + INIT,SKT n/^([^aAiIuUfFxXeEoO^/\\]) \u0928\u094d SKT + INIT,SKT p/^([^aAiIuUfFxXeEoO^/\\]) \u092a\u094d SKT + INIT,SKT P/^([^aAiIuUfFxXeEoO^/\\]) \u092b\u094d SKT + INIT,SKT b/^([^aAiIuUfFxXeEoO^/\\]) \u092c\u094d SKT + INIT,SKT B/^([^aAiIuUfFxXeEoO^/\\]) \u092d\u094d SKT + INIT,SKT m/^([^aAiIuUfFxXeEoO^/\\]) \u092e\u094d SKT + INIT,SKT y/^([^aAiIuUfFxXeEoO^/\\]) \u092f\u094d SKT + INIT,SKT r/^([^aAiIuUfFxXeEoO^/\\]) \u0930\u094d SKT + INIT,SKT l/^([^aAiIuUfFxXeEoO^/\\]) \u0932\u094d SKT + INIT,SKT L/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d SKT + INIT,SKT |/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d\u0939\u094d SKT + INIT,SKT v/^([^aAiIuUfFxXeEoO^/\\]) \u0935\u094d SKT + INIT,SKT S/^([^aAiIuUfFxXeEoO^/\\]) \u0936\u094d SKT + INIT,SKT z/^([^aAiIuUfFxXeEoO^/\\]) \u0937\u094d SKT + INIT,SKT s/^([^aAiIuUfFxXeEoO^/\\]) \u0938\u094d SKT + INIT,SKT h/^([^aAiIuUfFxXeEoO^/\\]) \u0939\u094d SKT + INIT,SKT k \u0915 SKT + INIT,SKT K \u0916 SKT + INIT,SKT g \u0917 SKT + INIT,SKT G \u0918 SKT + INIT,SKT N \u0919 SKT + INIT,SKT c \u091a SKT + INIT,SKT C \u091b SKT + INIT,SKT j \u091c SKT + INIT,SKT J \u091d SKT + INIT,SKT Y \u091e SKT + INIT,SKT w \u091f SKT + INIT,SKT W \u0920 SKT + INIT,SKT q \u0921 SKT + INIT,SKT Q \u0922 SKT + INIT,SKT R \u0923 SKT + INIT,SKT t \u0924 SKT + INIT,SKT T \u0925 SKT + INIT,SKT d \u0926 SKT + INIT,SKT D \u0927 SKT + INIT,SKT n \u0928 SKT + INIT,SKT p \u092a SKT 
+ INIT,SKT P \u092b SKT + INIT,SKT b \u092c SKT + INIT,SKT B \u092d SKT + INIT,SKT m \u092e SKT + INIT,SKT y \u092f SKT + INIT,SKT r \u0930 SKT + INIT,SKT l \u0932 SKT + INIT,SKT L \u0933 SKT + + INIT,SKT | \u0933\u094d\u0939 SKT + INIT,SKT v \u0935 SKT + INIT,SKT S \u0936 SKT + INIT,SKT z \u0937 SKT + INIT,SKT s \u0938 SKT + INIT,SKT h \u0939 SKT + + INIT,SKT M \u0902 INIT + INIT,SKT H \u0903 INIT + INIT,SKT ' \u093d INIT + INIT,SKT . \u0964 INIT + INIT,SKT .. \u0965 INIT + INIT,SKT 0 \u0966 INIT + INIT,SKT 1 \u0967 INIT + INIT,SKT 2 \u0968 INIT + INIT,SKT 3 \u0969 INIT + INIT,SKT 4 \u096a INIT + INIT,SKT 5 \u096b INIT + INIT,SKT 6 \u096c INIT + INIT,SKT 7 \u096d INIT + INIT,SKT 8 \u096e INIT + INIT,SKT 9 \u096f INIT + INIT,SKT \u0020 \u0020 INIT + INIT,SKT \u0009 \u0009 INIT + INIT,SKT \u000d \u000d INIT + INIT,SKT \u000a \u000a INIT + + + INIT,SKT - - INIT + + INIT,SKT \ \u0952 INIT + + INIT,SKT / \ua8eb INIT + + INIT,SKT ^ \u0951 INIT + + + INIT,SKT ~ \u0901 INIT + + INIT,SKT o~ \u0950 INIT + + + INIT,SKT Z \u1cf2 INIT + INIT,SKT V \u1cf2 INIT + + + INIT,SKT \H \u0903\u0952 SKT + INIT,SKT \M \u0902\u0952 SKT + INIT,SKT /H \u0903\ua8eb SKT + INIT,SKT /M \u0902\ua8eb SKT + INIT,SKT ^H \u0903\u0951 SKT + INIT,SKT ^M \u0902\u0951 SKT + + INIT,SKT ^\H \u0903\u0951\u0952 SKT + INIT,SKT ^\M \u0902\u0951\u0952 SKT + INIT,SKT \~ \u0901\u0952 SKT + INIT,SKT ^~ \u0901\u0951 SKT + INIT,SKT /~ \u0901\ua8eb SKT + INIT,SKT /~H \u0901\u0903\ua8eb SKT + + + INIT,SKT £ \ua8f2 INIT + + + INIT,SKT ¬A \u093e + INIT,SKT ¬i \u093f + INIT,SKT ¬I \u0940 + INIT,SKT ¬u \u0941 + INIT,SKT ¬U \u0942 + INIT,SKT ¬f \u0943 + INIT,SKT ¬F \u0944 + INIT,SKT ¬x \u0962 + INIT,SKT ¬X \u0963 + INIT,SKT ¬e \u0947 + INIT,SKT ¬E \u0948 + INIT,SKT ¬o \u094b + INIT,SKT ¬O \u094c + + + diff --git a/pwgissues/issue78/transcoder.py b/pwgissues/issue78/transcoder.py new file mode 100644 index 0000000..aee6177 --- /dev/null +++ b/pwgissues/issue78/transcoder.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python +#-*- 
""" Python version of transcoder.
 Uses built-in library xml.etree.ElementTree,
 rather than lxml.
 Revised 02-20-2017 Regarding special handling of slp1 to deva;
 search for regexCode variable, and fsmentry['regex'] for where this comes into play.
 This kind of coding is ugly, and needs to be revised for greater generality.
"""
from __future__ import print_function

__program_name__ = 'transcoder.py'
__author__ = 'Jim Funderburk'
__email__ = 'funderburk1@verizon.net'
__copyright__ = 'Copyright 2011, Jim Funderburk'
__license__ = 'GPL http://www.gnu.org/licenses/gpl.txt'
__date__ = '2011-12'

# Python Standard Library
import os
import sys
import codecs
import re
from unicodedata import normalize
import xml.etree.ElementTree as ET

## Jim Funderburk recoding into php of Java code developed by
## Ralph Bunker.
## This software is made available under the Creative Commons
## Creative Commons Attribution Non-Commercial Share Alike license available in full at , and summarized at . Permission is granted to build upon this work non-commercially, as long as credit is explicitly acknowledged exactly as described herein and derivative work is distributed under the same license.

## Module state
# transcoder_dir : directory holding the <from>_<to>.xml files.  The default
#   is ../data/transcoder relative to the directory containing this file;
#   change it with transcoder_set_dir.
# transcoder_fsmarr : cache of parsed machines, keyed by '<from>_<to>'.
transcoder_dir = os.path.dirname(os.path.abspath(__file__))
transcoder_dir = os.path.dirname(transcoder_dir)  ## parent
transcoder_dir += "/data/transcoder"
transcoder_fsmarr = {}
# Major version as a one-character string, kept for backward compatibility.
# Taken from version_info because sys.version[0] is only the first character
# of the version text.
python_version = str(sys.version_info[0])
if sys.version_info[0] >= 3:
    xrange = range
    unichr = chr

# Set to True to make transcoder_fsm write dbg_<from>_<to>.txt for every
# machine it loads.
transcoder_debug = False

# Look-ahead character classes used by transcoder_processString_match.
_SLP1_DEVA_LOOKAHEAD = re.compile(r'[^aAiIuUfFxXeEoO^\/\\\\]')
_HKT_TAMIL_LOOKAHEAD = re.compile('[^aAiIuUeEoO]')


def _transcoder_regex_code(sfrom, to):
    """Return the look-ahead code for an encoding pair, or None."""
    if sfrom.startswith('slp1') and to.startswith('deva'):
        return 'slp1_deva'
    if sfrom.startswith('deva') and to.startswith('slp1'):
        return 'deva_slp1'
    if sfrom.startswith('hkt') and to.startswith('tamil'):
        return 'hkt_tamil'
    return None


def _transcoder_fsm_dump(fsm, fromto, filein):
    """Write a readable description of *fsm* to dbg_<fromto>.txt."""
    filedbg = "dbg_%s.txt" % fromto
    print("filein=", filein)
    print("transcoder.py. Dbg info written to", filedbg)
    keys = ['starts', 'in', 'regex', 'out', 'next', 'inraw', 'outraw']
    with codecs.open(filedbg, "w", "utf-8") as fdbg:
        fdbg.write("fsmentries=...\n")
        for i, fsmentry in enumerate(fsm['fsm']):
            parts = []
            for key in keys:
                if key not in fsmentry:  # 'regex' is optional
                    continue
                val = fsmentry[key]
                if key == 'starts':
                    val = ' '.join(val)
                parts.append("%s => %s" % (key, val))
            fdbg.write("fsmentry[%s]=%s\n" % (i, ' , '.join(parts)))
            fdbg.write(" e-elt=%s\n" % fsmentry['e-elt'])
        fdbg.write("states=...\n")
        for c, state in fsm['states'].items():
            x = ' '.join('%s' % i for i in state)
            fdbg.write("c=%s, state=%s\n" % (c, x))


def transcoder_fsm(sfrom, to):
    """Load <sfrom>_<to>.xml from transcoder_dir into transcoder_fsmarr.

    Does nothing when the machine is already cached or when the xml file does
    not exist (transcoder_processString then returns its input unchanged).

    The cached value is a dict with keys
      'start'  : name of the start state (the root's 'start' attribute),
      'fsm'    : list of entries, one per <e> child of the root,
      'states' : dict mapping the first character of an entry's input to the
                 list of indexes (into 'fsm') of the entries starting with it.

    Raises KeyError when the root has no 'start' attribute and ValueError
    when an <e> element has no state name.
    """
    fromto = sfrom + "_" + to
    if fromto in transcoder_fsmarr:
        return
    regexCode = _transcoder_regex_code(sfrom, to)

    filein = transcoder_dir + '/' + fromto + ".xml"
    if not os.path.exists(filein):
        return
    root = ET.parse(filein).getroot()
    start = root.attrib['start']  ## required

    # fsmentries is a list of fsmentry elements, each of which is a dict
    # corresponding to one of the 'e' elements in the xml file.
    fsmentries = []
    for e in root:
        if e.tag != 'e':
            continue
        # findtext gives None for a missing element and '' for an empty one;
        # both are treated as the empty string.
        inval = e.findtext("in") or ''
        conlook = False
        match = re.match(r'^([^/]+)/\^', inval)
        if match:
            ## In transcoding from slp1 to devanagari, it is necessary to do a
            ## 'look-ahead' when deciding how to code a consonant. If the
            ## consonant is not followed by a vowel, then a virama has to be
            ## emitted. The input is coded in such cases as:
            ##   k/^([^aAiIuUfFxXeEoO^/\\])
            ## which is to be interpreted as: starting at the next character,
            ## check whether the input matches [^aAiIuUfFxXeEoO^/\\].
            ## Note that the last 3 elements '^', '/', and '\' are present
            ## only because of accents.
            ## Only the text before '/^' is kept here; the check itself is
            ## hard-coded per regexCode in transcoder_processString_match.
            ## Such entries are skipped for encoding pairs without a
            ## regexCode.
            if not regexCode:
                continue
            inval = match.group(1)
            conlook = True
        # s = state name of this entry. Can be a comma-delimited list
        sval = e.findtext("s")
        if not sval:
            raise ValueError("%s: <e> element without state name <s>: %r"
                             % (filein, ET.tostring(e)))
        startStates = sval.split(",")
        # out = the transformation of the input; may be empty
        outval = e.findtext("out") or ''
        # next state, this is optional. Its absence means use the first
        # state of s
        x = e.find("next")
        if x is not None:
            nextState = x.text
        else:
            nextState = startStates[0]

        fsmentry = {}
        fsmentry['starts'] = startStates
        # inval, outval may be written as backslash-u escapes; see to_unicode
        fsmentry['in'] = to_unicode(inval)
        # fsmentry['regex'] is defined only when conlook is true
        if conlook:
            fsmentry['regex'] = regexCode
        fsmentry['out'] = to_unicode(outval)
        fsmentry['next'] = nextState
        # Dec 5, 2013 save raw inval/outval
        fsmentry['inraw'] = inval
        fsmentry['outraw'] = outval
        fsmentry['e-elt'] = ET.tostring(e)
        fsmentries.append(fsmentry)

    ## states: keys are characters, and the value at a key is the list of
    ## subscripts i into fsmentries such that fsmentries[i]['in'] starts with
    ## that character. An entry with empty input is filed under ''.
    states = {}
    for ientry, fsmentry in enumerate(fsmentries):
        states.setdefault(fsmentry['in'][:1], []).append(ientry)

    fsm = {'start': start, 'fsm': fsmentries, 'states': states}
    transcoder_fsmarr[fromto] = fsm
    if transcoder_debug:
        _transcoder_fsm_dump(fsm, fromto, filein)


def to_unicode(x):
    r"""Decode a string written with \uxxxx escapes.

    x is assumed to be a string with one of two forms
     (a) \uxxxx\uyyyy...  Each escape is replaced by the character with that
         hexadecimal code; text after the fourth hex digit of an escape is
         copied unchanged.
     (b) other (anything not starting with \u, or exactly \u) - this is
         returned without change.

    Raises ValueError when an escape does not hold hexadecimal digits.
    """
    if x == r"\u":  # a case where notation is confusing
        return x
    if not x.startswith('\\u'):
        return x
    parts = []
    for z in x.split('\\u'):
        if z == '':
            continue
        parts.append(unichr(int(z[:4], 16)))
        parts.append(z[4:])
    return ''.join(parts)


# Virama and the dependent vowel signs. A Devanagari consonant followed by
# one of these does not carry the inherent vowel 'a'.
vowel_signs = ['\u094d','\u093e','\u093f','\u0940','\u0941','\u0942','\u0943','\u0944','\u0962','\u0963','\u0947','\u0948','\u094b','\u094c']
vowel_signs_unicode = [to_unicode(vowel_sign) for vowel_sign in vowel_signs]


def transcoder_processString(line, from1, to):
    """Return *line* transcoded from encoding *from1* to encoding *to*.

    *line* is returned unchanged when both names are equal or when no
    <from1>_<to>.xml file is available in transcoder_dir.

    At each position the entry that is valid in the current state and
    consumes the most input wins (the earliest such entry on ties). A
    character for which no entry applies is copied to the output and the
    machine returns to its start state.
    """
    if from1 == to:
        return line
    fromto = from1 + "_" + to
    if fromto not in transcoder_fsmarr:
        transcoder_fsm(from1, to)
    fsm = transcoder_fsmarr.get(fromto)
    if fsm is None:
        return line
    start = fsm['start']
    fsmentries = fsm['fsm']
    states = fsm['states']
    currentState = start
    pieces = []  ## parts of the returned value
    n = 0  ## current character position in line
    m = len(line)
    while n < m:
        c = line[n]
        nbest = 0
        bestFE = None
        for isub in states.get(c, ()):
            fsmentry = fsmentries[isub]
            if currentState not in fsmentry['starts']:
                continue
            nmatch = len(transcoder_processString_match(line, n, m, fsmentry))
            if nmatch > nbest:
                nbest = nmatch
                bestFE = fsmentry
        if bestFE is not None:
            pieces.append(bestFE['out'])
            n += nbest
            currentState = bestFE['next']
        else:
            ## Default condition. emit the character and change state to start
            pieces.append(c)
            currentState = start
            n += 1
    return ''.join(pieces)


def transcoder_processString_match(line, n, m, fsmentry):
    """Return the input text of *fsmentry* if it matches *line* at *n*.

    *m* is the length of *line*. The empty string is returned when the entry
    does not apply; the caller cannot tell that apart from an entry whose
    input is empty.

    Entries that carry a 'regex' code (see transcoder_fsm) additionally
    inspect the text following the match, unless the match ends the line:
      slp1_deva, hkt_tamil : the next character must not be a vowel
                             (for slp1 also not an accent mark);
      deva_slp1            : the next character must not be virama or a
                             vowel sign.
    """
    edge = fsmentry['in']
    if not line.startswith(edge, n, m):
        return ""
    if 'regex' not in fsmentry:
        return edge
    n1 = n + len(edge)
    if n1 == m:
        return edge
    regex = fsmentry['regex']
    if regex == 'slp1_deva':
        return edge if _SLP1_DEVA_LOOKAHEAD.match(line[n1]) else ""
    if regex == 'hkt_tamil':
        return edge if _HKT_TAMIL_LOOKAHEAD.match(line[n1]) else ""
    if regex == 'deva_slp1':
        for vowel_sign1 in vowel_signs_unicode:
            if line.startswith(vowel_sign1, n1, m):
                # The consonant is followed by virama or a vowel sign, so
                # the rule 'consonant + inherent a' must fail.
                return ""
        # the consonant is not followed by either virama or a vowel sign.
        return edge
    return ""


def transcoder_processElements(line, from1, to, tagname):
    """Transcode the parts of *line* enclosed in <tagname>...</tagname>.

    For example, if tagname = 'SA'
     and line = 'The word <SA>rAma</SA> refers to a person',
     returned would be 'The word XXX refers to a person',
     where XXX is the transformation of the string 'rAma' acc. to from1,to.
    The enclosing tags are not part of the result.
    """
    global transcoder_from, transcoder_to
    # Still published for transcoder_processElements_callback, which older
    # callers may use directly.
    transcoder_from = from1
    transcoder_to = to
    tag = re.escape(tagname)
    # The pattern needs both the opening and the closing tag: one placeholder
    # with two arguments made the %-formatting raise TypeError.
    regex = '<%s>(.*?)</%s>' % (tag, tag)

    def convert(match):
        return transcoder_processString(match.group(1), from1, to)

    return re.sub(regex, convert, line)


def transcoder_processElements_callback(match):
    """re.sub callback using the encodings of the last processElements call."""
    global transcoder_from, transcoder_to
    return transcoder_processString(match.group(1), transcoder_from, transcoder_to)


def transcoder_set_dir(dir):
    """Use *dir* as the directory of the transcoder xml files.

    The directory is changed only when *dir* exists. The directory in effect
    after the call is returned in either case, so a caller can detect a
    rejected path by comparing the result with os.path.abspath(dir).
    """
    global transcoder_dir
    path = os.path.abspath(dir)
    if os.path.exists(path):
        if path != transcoder_dir:
            # Machines are cached by '<from>_<to>' only; without this, tables
            # loaded from the previous directory would still be used.
            transcoder_fsmarr.clear()
        transcoder_dir = path
    return transcoder_dir


def transcoder_get_dir():
    """Return the directory currently searched for transcoder xml files."""
    return transcoder_dir