diff --git a/pwgissues/issue78/debug_transcode.py b/pwgissues/issue78/debug_transcode.py
new file mode 100644
index 0000000..9285071
--- /dev/null
+++ b/pwgissues/issue78/debug_transcode.py
@@ -0,0 +1,100 @@
+#-*- coding:utf-8 -*-
+"""debug_transcode.py
+"""
+from __future__ import print_function
+import sys, re,codecs
+import transcoder
+
+def read_lines(filein):
+ """Read the UTF-8 text file filein and return its lines as a list.
+
+ Trailing carriage-return / line-feed characters are removed from each line.
+ The number of lines read is reported on stdout.
+ """
+ with codecs.open(filein,encoding='utf-8',mode='r') as f:
+  lines = [x.rstrip('\r\n') for x in f]
+ print(len(lines),"lines read from",filein)
+ return lines
+
+def write_lines(fileout,outarr):
+ """Write the strings of outarr to the UTF-8 file fileout, one per line.
+
+ A newline is appended to every string.  The number of lines written is
+ reported on stdout.
+ """
+ with codecs.open(fileout,"w","utf-8") as f:
+  for out in outarr:
+   f.write(out+'\n')
+ print(len(outarr),"lines written to",fileout)
+
+def print_unicode(u):
+ """Describe each character of the string u.
+
+ Returns a list with one string per character, in the form
+ 'HEXCODE | CHARACTER | UNICODE NAME'.  Despite the name, nothing is
+ printed; the caller decides what to do with the lines.
+
+ Sample output:
+x= a/MSa—BU/
+0905 | अ | DEVANAGARI LETTER A
+0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
+0902 | ं | DEVANAGARI SIGN ANUSVARA
+0936 | श | DEVANAGARI LETTER SHA
+2014 | — | EM DASH
+092D | भ | DEVANAGARI LETTER BHA
+0942 | ू | DEVANAGARI VOWEL SIGN UU
+0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
+ """
+ import unicodedata
+ outarr = []
+ for c in u:
+  # unicodedata.name raises ValueError for characters without a name
+  # (e.g. control characters); pass a default so that one odd character
+  # does not abort the whole debug listing.
+  name = unicodedata.name(c,'(unnamed)')
+  outarr.append(f"{ord(c):04X} | {c} | {name}")
+ return outarr
+
+def transcode(x,tranin,tranout):
+ # Thin wrapper: the transcoder module converts x from the encoding named
+ # tranin to the encoding named tranout.
+ return transcoder.transcoder_processString(x,tranin,tranout)
+
+def convert_line(line,tranin,tranout,outarr):
+ """Transcode the text X of every '{#X#}' span in line from tranin to tranout.
+
+ Every span is also checked for invertibility: the result Y is transcoded
+ back to tranin and, when that does not reproduce X, diagnostic lines
+ (the strings involved and a character-by-character listing of each)
+ are appended to outarr.
+ Returns line with each '{#X#}' replaced by '{#Y#}'.
+ """
+ def f(m):
+  x = m.group(1)
+  y = transcode(x,tranin,tranout)
+  ans = '{#%s#}' % y
+  # Uncomment the next line to skip the invertibility check.
+  #return ans
+  x1 = transcode(y,tranout,tranin)
+  if x1 != x:
+   # round trip failed: record both strings and their code points
+   y1 = transcode(x1,tranin,tranout)
+   outarr.append(' x=%s, y=%s' %(x,y))
+   outarr.extend(print_unicode(x))
+   outarr.append('x1=%s, y1=%s' %(x1,y1))
+   outarr.extend(print_unicode(x1))
+  return ans
+
+ regex = '{#(.*?)#}'
+ return re.sub(regex,f,line)
+
+def convert_lines(lines,tranin,tranout,outarr):
+ """Apply convert_line to every element of lines; return the new lines.
+
+ Diagnostics from the invertibility check accumulate in outarr.
+ """
+ return [convert_line(line,tranin,tranout,outarr) for line in lines]
+
+
+def test(lines,tranin=None,tranout=None):
+ """Transcode the text of every '{#X#}' span of each line; return the new lines.
+
+ tranin, tranout: names of the source and target encodings.  When omitted,
+ the module-level variables of the same names are used; these are set in
+ the __main__ block, which is how the script itself calls this function.
+ """
+ if tranin is None:
+  tranin = globals()['tranin']
+ if tranout is None:
+  tranout = globals()['tranout']
+ def f(m):
+  return '{#%s#}' % transcode(m.group(1),tranin,tranout)
+ return [re.sub(r'{#(.*?)#}',f,line) for line in lines]
+
+if __name__=="__main__":
+ # Five arguments are required; report the expected usage instead of
+ # failing with an IndexError when some are missing.
+ if len(sys.argv) < 6:
+  print("Usage: python debug_transcode.py transcoderdir tranin tranout filein fileout")
+  sys.exit(1)
+ transcoderdir = sys.argv[1] # directory containing the transcoder xml files
+ transcoder.transcoder_set_dir(transcoderdir)
+ # tranin and tranout are module-level names: test() reads them.
+ tranin = sys.argv[2]
+ tranout = sys.argv[3]
+ filein = sys.argv[4]
+ fileout = sys.argv[5]
+
+ lines = read_lines(filein)
+ outarr = test(lines)
+ write_lines(fileout,outarr)
diff --git a/pwgissues/issue78/readme.txt b/pwgissues/issue78/readme.txt
new file mode 100644
index 0000000..9d09d62
--- /dev/null
+++ b/pwgissues/issue78/readme.txt
@@ -0,0 +1,26 @@
+10-06-2024
+issue: https://github.com/sanskrit-lexicon/PWG/issues/78
+
+# This directory
+cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue78
+
+# proposed enhancement of slp1 transcoding for vowel-markers.
+
+https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt
+
+There is a 'special' transcoding of slp1 to Devanagari
+that is used in dictionaries: pwg, pw, sch, pwkvn
+The transcoding files are slp1_deva1.xml and its inverse deva1_slp1.xml
+We start with the versions of these two files in transcode0 directory.
+
+The revision is in transcode1 directory.
+
+python debug_transcode.py transcode1 slp1 deva1 test01_in.txt test01_out.txt
+python debug_transcode.py transcode1 deva1 slp1 test01_out.txt test01_in_out.txt
+
+test01_in_out.txt differs from test01_in.txt: the '¬' vowel-marker is lost
+in the round trip, so the revised transcoding is not invertible.
+
+Note that deva1_slp1.xml defines a finite-state machine with only one state,
+ the INIT state.
+
diff --git a/pwgissues/issue78/test01_in.txt b/pwgissues/issue78/test01_in.txt
new file mode 100644
index 0000000..d473d7a
--- /dev/null
+++ b/pwgissues/issue78/test01_in.txt
@@ -0,0 +1,4 @@
+{#tarudU¬AlakA#}
+{#pElA¬Ada#}
+{#rA¬ARa/#}
+{#¬i#}
diff --git a/pwgissues/issue78/test01_in_out.txt b/pwgissues/issue78/test01_in_out.txt
new file mode 100644
index 0000000..82891ce
--- /dev/null
+++ b/pwgissues/issue78/test01_in_out.txt
@@ -0,0 +1,4 @@
+{#tarudUAlakA#}
+{#pElAAda#}
+{#rAARa/#}
+{#i#}
diff --git a/pwgissues/issue78/test01_out.txt b/pwgissues/issue78/test01_out.txt
new file mode 100644
index 0000000..29e22b1
--- /dev/null
+++ b/pwgissues/issue78/test01_out.txt
@@ -0,0 +1,4 @@
+{#तरुदूालका#}
+{#पैलााद#}
+{#रााण꣫#}
+{#ि#}
diff --git a/pwgissues/issue78/transcode0/deva1_slp1.xml b/pwgissues/issue78/transcode0/deva1_slp1.xml
new file mode 100644
index 0000000..4bc821a
--- /dev/null
+++ b/pwgissues/issue78/transcode0/deva1_slp1.xml
@@ -0,0 +1,220 @@
+
+
+ INIT \u0905 a
+ INIT \u0906 A
+ INIT \u0907 i
+ INIT \u0908 I
+ INIT \u0909 u
+ INIT \u090a U
+ INIT \u090b f
+ INIT \u0960 F
+ INIT \u090c x
+ INIT \u0961 X
+ INIT \u090f e
+ INIT \u0910 E
+ INIT \u0913 o
+ INIT \u0914 O
+
+ INIT \u0905\ua8eb a/
+ INIT \u0906\ua8eb A/
+ INIT \u0907\ua8eb i/
+ INIT \u0908\ua8eb I/
+ INIT \u0909\ua8eb u/
+ INIT \u090a\ua8eb U/
+ INIT \u090b\ua8eb f/
+ INIT \u0960\ua8eb F/
+ INIT \u090c\ua8eb x/
+ INIT \u0961\ua8eb X/
+ INIT \u090f\ua8eb e/
+ INIT \u0910\ua8eb E/
+ INIT \u0913\ua8eb o/
+ INIT \u0914\ua8eb O/
+
+ INIT \u0905\u0952 a\
+ INIT \u0906\u0952 A\
+ INIT \u0907\u0952 i\
+ INIT \u0908\u0952 I\
+ INIT \u0909\u0952 u\
+ INIT \u090a\u0952 U\
+ INIT \u090b\u0952 f\
+ INIT \u0960\u0952 F\
+ INIT \u090c\u0952 x\
+ INIT \u0961\u0952 X\
+ INIT \u090f\u0952 e\
+ INIT \u0910\u0952 E\
+ INIT \u0913\u0952 o\
+ INIT \u0914\u0952 O\
+
+ INIT \u093e A
+ INIT \u093f i
+ INIT \u0940 I
+ INIT \u0941 u
+ INIT \u0942 U
+ INIT \u0943 f
+ INIT \u0944 F
+ INIT \u0962 x
+ INIT \u0963 X
+ INIT \u0947 e
+ INIT \u0948 E
+ INIT \u094b o
+ INIT \u094c O
+
+ INIT \u0951 ^
+ INIT \u093e\u0951 A^
+ INIT \u093f\u0951 i^
+ INIT \u0940\u0951 I^
+ INIT \u0941\u0951 u^
+ INIT \u0942\u0951 U^
+ INIT \u0943\u0951 f^
+ INIT \u0944\u0951 F^
+ INIT \u0962\u0951 x^
+ INIT \u0963\u0951 X^
+ INIT \u0947\u0951 e^
+ INIT \u0948\u0951 E^
+ INIT \u094b\u0951 o^
+ INIT \u094c\u0951 O^
+
+ INIT \u0952 \
+ INIT \u093e\u0952 A\
+ INIT \u093f\u0952 i\
+ INIT \u0940\u0952 I\
+ INIT \u0941\u0952 u\
+ INIT \u0942\u0952 U\
+ INIT \u0943\u0952 f\
+ INIT \u0944\u0952 F\
+ INIT \u0962\u0952 x\
+ INIT \u0963\u0952 X\
+ INIT \u0947\u0952 e\
+ INIT \u0948\u0952 E\
+ INIT \u094b\u0952 o\
+ INIT \u094c\u0952 O\
+
+ INIT \ua8eb /
+ INIT \u093e\ua8eb A/
+ INIT \u093f\ua8eb i/
+ INIT \u0940\ua8eb I/
+ INIT \u0941\ua8eb u/
+ INIT \u0942\ua8eb U/
+ INIT \u0943\ua8eb f/
+ INIT \u0944\ua8eb F/
+ INIT \u0962\ua8eb x/
+ INIT \u0963\ua8eb X/
+ INIT \u0947\ua8eb e/
+ INIT \u0948\ua8eb E/
+ INIT \u094b\ua8eb o/
+ INIT \u094c\ua8eb O/
+
+
+ INIT \u094d
+
+ INIT \u0915/^ ka
+ INIT \u0916/^ Ka
+ INIT \u0917/^ ga
+ INIT \u0918/^ Ga
+ INIT \u0919/^ Na
+ INIT \u091a/^ ca
+ INIT \u091b/^ Ca
+ INIT \u091c/^ ja
+ INIT \u091d/^ Ja
+ INIT \u091e/^ Ya
+ INIT \u091f/^ wa
+ INIT \u0920/^ Wa
+ INIT \u0921/^ qa
+ INIT \u0922/^ Qa
+ INIT \u0923/^ Ra
+ INIT \u0924/^ ta
+ INIT \u0925/^ Ta
+ INIT \u0926/^ da
+ INIT \u0927/^ Da
+ INIT \u0928/^ na
+ INIT \u092a/^ pa
+ INIT \u092b/^ Pa
+ INIT \u092c/^ ba
+ INIT \u092d/^ Ba
+ INIT \u092e/^ ma
+ INIT \u092f/^ ya
+ INIT \u0930/^ ra
+ INIT \u0932/^ la
+ INIT \u0933/^ La
+ INIT \u0933\u094d\u0939/^ |a
+ INIT \u0935/^ va
+ INIT \u0936/^ Sa
+ INIT \u0937/^ za
+ INIT \u0938/^ sa
+ INIT \u0939/^ ha
+
+
+
+ INIT \u0915 k
+ INIT \u0916 K
+ INIT \u0917 g
+ INIT \u0918 G
+ INIT \u0919 N
+ INIT \u091a c
+ INIT \u091b C
+ INIT \u091c j
+ INIT \u091d J
+ INIT \u091e Y
+ INIT \u091f w
+ INIT \u0920 W
+ INIT \u0921 q
+ INIT \u0922 Q
+ INIT \u0923 R
+ INIT \u0924 t
+ INIT \u0925 T
+ INIT \u0926 d
+ INIT \u0927 D
+ INIT \u0928 n
+ INIT \u092a p
+ INIT \u092b P
+ INIT \u092c b
+ INIT \u092d B
+ INIT \u092e m
+ INIT \u092f y
+ INIT \u0930 r
+ INIT \u0932 l
+ INIT \u0933 L
+ INIT \u0933\u094d\u0939 |
+ INIT \u0935 v
+ INIT \u0936 S
+ INIT \u0937 z
+ INIT \u0938 s
+ INIT \u0939 h
+ INIT \u0902 M
+ INIT \u0903 H
+ INIT \u093d '
+ INIT \u0964 .
+ INIT \u0965 ..
+ INIT \u0966 0
+ INIT \u0967 1
+ INIT \u0968 2
+ INIT \u0969 3
+ INIT \u096a 4
+ INIT \u096b 5
+ INIT \u096c 6
+ INIT \u096d 7
+ INIT \u096e 8
+ INIT \u096f 9
+
+ INIT \u0901 ~
+
+ INIT \u0950 o~
+
+
+ INIT \u0903\u0952 \H
+ INIT \u0902\u0952 \M
+ INIT \u0903\ua8eb /H
+ INIT \u0902\ua8eb /M
+ INIT \u0903\u0951 ^H
+ INIT \u0902\u0951 ^M
+
+ INIT \u0903\u0951\u0952 ^\H
+ INIT \u0902\u0951\u0952 ^\M
+ INIT \u0901\u0952 \~
+ INIT \u0901\u0951 ^~
+ INIT \u0901\ua8eb /~
+ INIT \u0901\u0903\ua8eb /~H
+ INIT \u1cf2 Z
+
diff --git a/pwgissues/issue78/transcode0/slp1_deva1.xml b/pwgissues/issue78/transcode0/slp1_deva1.xml
new file mode 100644
index 0000000..8fcdaa1
--- /dev/null
+++ b/pwgissues/issue78/transcode0/slp1_deva1.xml
@@ -0,0 +1,175 @@
+
+
+ INIT a \u0905 INIT
+ INIT A \u0906 INIT
+ INIT i \u0907 INIT
+ INIT I \u0908 INIT
+ INIT u \u0909 INIT
+ INIT U \u090a INIT
+ INIT f \u090b INIT
+ INIT F \u0960 INIT
+ INIT x \u090c INIT
+ INIT X \u0961 INIT
+ INIT e \u090f INIT
+ INIT E \u0910 INIT
+ INIT o \u0913 INIT
+ INIT O \u0914 INIT
+
+ SKT a INIT
+ SKT A \u093e INIT
+ SKT i \u093f INIT
+ SKT I \u0940 INIT
+ SKT u \u0941 INIT
+ SKT U \u0942 INIT
+ SKT f \u0943 INIT
+ SKT F \u0944 INIT
+ SKT x \u0962 INIT
+ SKT X \u0963 INIT
+ SKT e \u0947 INIT
+ SKT E \u0948 INIT
+ SKT o \u094b INIT
+ SKT O \u094c INIT
+
+
+ INIT,SKT k/^([^aAiIuUfFxXeEoO^/\\]) \u0915\u094d SKT
+ INIT,SKT K/^([^aAiIuUfFxXeEoO^/\\]) \u0916\u094d SKT
+ INIT,SKT g/^([^aAiIuUfFxXeEoO^/\\]) \u0917\u094d SKT
+ INIT,SKT G/^([^aAiIuUfFxXeEoO^/\\]) \u0918\u094d SKT
+ INIT,SKT N/^([^aAiIuUfFxXeEoO^/\\]) \u0919\u094d SKT
+ INIT,SKT c/^([^aAiIuUfFxXeEoO^/\\]) \u091a\u094d SKT
+ INIT,SKT C/^([^aAiIuUfFxXeEoO^/\\]) \u091b\u094d SKT
+ INIT,SKT j/^([^aAiIuUfFxXeEoO^/\\]) \u091c\u094d SKT
+ INIT,SKT J/^([^aAiIuUfFxXeEoO^/\\]) \u091d\u094d SKT
+ INIT,SKT Y/^([^aAiIuUfFxXeEoO^/\\]) \u091e\u094d SKT
+ INIT,SKT w/^([^aAiIuUfFxXeEoO^/\\]) \u091f\u094d SKT
+ INIT,SKT W/^([^aAiIuUfFxXeEoO^/\\]) \u0920\u094d SKT
+ INIT,SKT q/^([^aAiIuUfFxXeEoO^/\\]) \u0921\u094d SKT
+ INIT,SKT Q/^([^aAiIuUfFxXeEoO^/\\]) \u0922\u094d SKT
+ INIT,SKT R/^([^aAiIuUfFxXeEoO^/\\]) \u0923\u094d SKT
+ INIT,SKT t/^([^aAiIuUfFxXeEoO^/\\]) \u0924\u094d SKT
+ INIT,SKT T/^([^aAiIuUfFxXeEoO^/\\]) \u0925\u094d SKT
+ INIT,SKT d/^([^aAiIuUfFxXeEoO^/\\]) \u0926\u094d SKT
+ INIT,SKT D/^([^aAiIuUfFxXeEoO^/\\]) \u0927\u094d SKT
+ INIT,SKT n/^([^aAiIuUfFxXeEoO^/\\]) \u0928\u094d SKT
+ INIT,SKT p/^([^aAiIuUfFxXeEoO^/\\]) \u092a\u094d SKT
+ INIT,SKT P/^([^aAiIuUfFxXeEoO^/\\]) \u092b\u094d SKT
+ INIT,SKT b/^([^aAiIuUfFxXeEoO^/\\]) \u092c\u094d SKT
+ INIT,SKT B/^([^aAiIuUfFxXeEoO^/\\]) \u092d\u094d SKT
+ INIT,SKT m/^([^aAiIuUfFxXeEoO^/\\]) \u092e\u094d SKT
+ INIT,SKT y/^([^aAiIuUfFxXeEoO^/\\]) \u092f\u094d SKT
+ INIT,SKT r/^([^aAiIuUfFxXeEoO^/\\]) \u0930\u094d SKT
+ INIT,SKT l/^([^aAiIuUfFxXeEoO^/\\]) \u0932\u094d SKT
+ INIT,SKT L/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d SKT
+ INIT,SKT |/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d\u0939\u094d SKT
+ INIT,SKT v/^([^aAiIuUfFxXeEoO^/\\]) \u0935\u094d SKT
+ INIT,SKT S/^([^aAiIuUfFxXeEoO^/\\]) \u0936\u094d SKT
+ INIT,SKT z/^([^aAiIuUfFxXeEoO^/\\]) \u0937\u094d SKT
+ INIT,SKT s/^([^aAiIuUfFxXeEoO^/\\]) \u0938\u094d SKT
+ INIT,SKT h/^([^aAiIuUfFxXeEoO^/\\]) \u0939\u094d SKT
+ INIT,SKT k \u0915 SKT
+ INIT,SKT K \u0916 SKT
+ INIT,SKT g \u0917 SKT
+ INIT,SKT G \u0918 SKT
+ INIT,SKT N \u0919 SKT
+ INIT,SKT c \u091a SKT
+ INIT,SKT C \u091b SKT
+ INIT,SKT j \u091c SKT
+ INIT,SKT J \u091d SKT
+ INIT,SKT Y \u091e SKT
+ INIT,SKT w \u091f SKT
+ INIT,SKT W \u0920 SKT
+ INIT,SKT q \u0921 SKT
+ INIT,SKT Q \u0922 SKT
+ INIT,SKT R \u0923 SKT
+ INIT,SKT t \u0924 SKT
+ INIT,SKT T \u0925 SKT
+ INIT,SKT d \u0926 SKT
+ INIT,SKT D \u0927 SKT
+ INIT,SKT n \u0928 SKT
+ INIT,SKT p \u092a SKT
+ INIT,SKT P \u092b SKT
+ INIT,SKT b \u092c SKT
+ INIT,SKT B \u092d SKT
+ INIT,SKT m \u092e SKT
+ INIT,SKT y \u092f SKT
+ INIT,SKT r \u0930 SKT
+ INIT,SKT l \u0932 SKT
+ INIT,SKT L \u0933 SKT
+
+ INIT,SKT | \u0933\u094d\u0939 SKT
+ INIT,SKT v \u0935 SKT
+ INIT,SKT S \u0936 SKT
+ INIT,SKT z \u0937 SKT
+ INIT,SKT s \u0938 SKT
+ INIT,SKT h \u0939 SKT
+
+ INIT,SKT M \u0902 INIT
+ INIT,SKT H \u0903 INIT
+ INIT,SKT ' \u093d INIT
+ INIT,SKT . \u0964 INIT
+ INIT,SKT .. \u0965 INIT
+ INIT,SKT 0 \u0966 INIT
+ INIT,SKT 1 \u0967 INIT
+ INIT,SKT 2 \u0968 INIT
+ INIT,SKT 3 \u0969 INIT
+ INIT,SKT 4 \u096a INIT
+ INIT,SKT 5 \u096b INIT
+ INIT,SKT 6 \u096c INIT
+ INIT,SKT 7 \u096d INIT
+ INIT,SKT 8 \u096e INIT
+ INIT,SKT 9 \u096f INIT
+ INIT,SKT \u0020 \u0020 INIT
+ INIT,SKT \u0009 \u0009 INIT
+ INIT,SKT \u000d \u000d INIT
+ INIT,SKT \u000a \u000a INIT
+
+
+ INIT,SKT - - INIT
+
+ INIT,SKT \ \u0952 INIT
+
+ INIT,SKT / \ua8eb INIT
+
+ INIT,SKT ^ \u0951 INIT
+
+
+ INIT,SKT ~ \u0901 INIT
+
+ INIT,SKT o~ \u0950 INIT
+
+
+ INIT,SKT Z \u1cf2 INIT
+ INIT,SKT V \u1cf2 INIT
+
+
+ INIT,SKT \H \u0903\u0952 SKT
+ INIT,SKT \M \u0902\u0952 SKT
+ INIT,SKT /H \u0903\ua8eb SKT
+ INIT,SKT /M \u0902\ua8eb SKT
+ INIT,SKT ^H \u0903\u0951 SKT
+ INIT,SKT ^M \u0902\u0951 SKT
+
+ INIT,SKT ^\H \u0903\u0951\u0952 SKT
+ INIT,SKT ^\M \u0902\u0951\u0952 SKT
+ INIT,SKT \~ \u0901\u0952 SKT
+ INIT,SKT ^~ \u0901\u0951 SKT
+ INIT,SKT /~ \u0901\ua8eb SKT
+ INIT,SKT /~H \u0901\u0903\ua8eb SKT
+
+
+ INIT,SKT £ \ua8f2 INIT
+
+
diff --git a/pwgissues/issue78/transcode1/deva1_slp1.xml b/pwgissues/issue78/transcode1/deva1_slp1.xml
new file mode 100644
index 0000000..e6a9ea0
--- /dev/null
+++ b/pwgissues/issue78/transcode1/deva1_slp1.xml
@@ -0,0 +1,236 @@
+
+
+ INIT \u0905 a
+ INIT \u0906 A
+ INIT \u0907 i
+ INIT \u0908 I
+ INIT \u0909 u
+ INIT \u090a U
+ INIT \u090b f
+ INIT \u0960 F
+ INIT \u090c x
+ INIT \u0961 X
+ INIT \u090f e
+ INIT \u0910 E
+ INIT \u0913 o
+ INIT \u0914 O
+
+ INIT \u0905\ua8eb a/
+ INIT \u0906\ua8eb A/
+ INIT \u0907\ua8eb i/
+ INIT \u0908\ua8eb I/
+ INIT \u0909\ua8eb u/
+ INIT \u090a\ua8eb U/
+ INIT \u090b\ua8eb f/
+ INIT \u0960\ua8eb F/
+ INIT \u090c\ua8eb x/
+ INIT \u0961\ua8eb X/
+ INIT \u090f\ua8eb e/
+ INIT \u0910\ua8eb E/
+ INIT \u0913\ua8eb o/
+ INIT \u0914\ua8eb O/
+
+ INIT \u0905\u0952 a\
+ INIT \u0906\u0952 A\
+ INIT \u0907\u0952 i\
+ INIT \u0908\u0952 I\
+ INIT \u0909\u0952 u\
+ INIT \u090a\u0952 U\
+ INIT \u090b\u0952 f\
+ INIT \u0960\u0952 F\
+ INIT \u090c\u0952 x\
+ INIT \u0961\u0952 X\
+ INIT \u090f\u0952 e\
+ INIT \u0910\u0952 E\
+ INIT \u0913\u0952 o\
+ INIT \u0914\u0952 O\
+
+ INIT \u093e A
+ INIT \u093f i
+ INIT \u0940 I
+ INIT \u0941 u
+ INIT \u0942 U
+ INIT \u0943 f
+ INIT \u0944 F
+ INIT \u0962 x
+ INIT \u0963 X
+ INIT \u0947 e
+ INIT \u0948 E
+ INIT \u094b o
+ INIT \u094c O
+
+ INIT \u0951 ^
+ INIT \u093e\u0951 A^
+ INIT \u093f\u0951 i^
+ INIT \u0940\u0951 I^
+ INIT \u0941\u0951 u^
+ INIT \u0942\u0951 U^
+ INIT \u0943\u0951 f^
+ INIT \u0944\u0951 F^
+ INIT \u0962\u0951 x^
+ INIT \u0963\u0951 X^
+ INIT \u0947\u0951 e^
+ INIT \u0948\u0951 E^
+ INIT \u094b\u0951 o^
+ INIT \u094c\u0951 O^
+
+ INIT \u0952 \
+ INIT \u093e\u0952 A\
+ INIT \u093f\u0952 i\
+ INIT \u0940\u0952 I\
+ INIT \u0941\u0952 u\
+ INIT \u0942\u0952 U\
+ INIT \u0943\u0952 f\
+ INIT \u0944\u0952 F\
+ INIT \u0962\u0952 x\
+ INIT \u0963\u0952 X\
+ INIT \u0947\u0952 e\
+ INIT \u0948\u0952 E\
+ INIT \u094b\u0952 o\
+ INIT \u094c\u0952 O\
+
+ INIT \ua8eb /
+ INIT \u093e\ua8eb A/
+ INIT \u093f\ua8eb i/
+ INIT \u0940\ua8eb I/
+ INIT \u0941\ua8eb u/
+ INIT \u0942\ua8eb U/
+ INIT \u0943\ua8eb f/
+ INIT \u0944\ua8eb F/
+ INIT \u0962\ua8eb x/
+ INIT \u0963\ua8eb X/
+ INIT \u0947\ua8eb e/
+ INIT \u0948\ua8eb E/
+ INIT \u094b\ua8eb o/
+ INIT \u094c\ua8eb O/
+
+
+ INIT \u094d
+
+ INIT \u0915/^ ka
+ INIT \u0916/^ Ka
+ INIT \u0917/^ ga
+ INIT \u0918/^ Ga
+ INIT \u0919/^ Na
+ INIT \u091a/^ ca
+ INIT \u091b/^ Ca
+ INIT \u091c/^ ja
+ INIT \u091d/^ Ja
+ INIT \u091e/^ Ya
+ INIT \u091f/^ wa
+ INIT \u0920/^ Wa
+ INIT \u0921/^ qa
+ INIT \u0922/^ Qa
+ INIT \u0923/^ Ra
+ INIT \u0924/^ ta
+ INIT \u0925/^ Ta
+ INIT \u0926/^ da
+ INIT \u0927/^ Da
+ INIT \u0928/^ na
+ INIT \u092a/^ pa
+ INIT \u092b/^ Pa
+ INIT \u092c/^ ba
+ INIT \u092d/^ Ba
+ INIT \u092e/^ ma
+ INIT \u092f/^ ya
+ INIT \u0930/^ ra
+ INIT \u0932/^ la
+ INIT \u0933/^ La
+ INIT \u0933\u094d\u0939/^ |a
+ INIT \u0935/^ va
+ INIT \u0936/^ Sa
+ INIT \u0937/^ za
+ INIT \u0938/^ sa
+ INIT \u0939/^ ha
+
+
+
+ INIT \u0915 k
+ INIT \u0916 K
+ INIT \u0917 g
+ INIT \u0918 G
+ INIT \u0919 N
+ INIT \u091a c
+ INIT \u091b C
+ INIT \u091c j
+ INIT \u091d J
+ INIT \u091e Y
+ INIT \u091f w
+ INIT \u0920 W
+ INIT \u0921 q
+ INIT \u0922 Q
+ INIT \u0923 R
+ INIT \u0924 t
+ INIT \u0925 T
+ INIT \u0926 d
+ INIT \u0927 D
+ INIT \u0928 n
+ INIT \u092a p
+ INIT \u092b P
+ INIT \u092c b
+ INIT \u092d B
+ INIT \u092e m
+ INIT \u092f y
+ INIT \u0930 r
+ INIT \u0932 l
+ INIT \u0933 L
+ INIT \u0933\u094d\u0939 |
+ INIT \u0935 v
+ INIT \u0936 S
+ INIT \u0937 z
+ INIT \u0938 s
+ INIT \u0939 h
+ INIT \u0902 M
+ INIT \u0903 H
+ INIT \u093d '
+ INIT \u0964 .
+ INIT \u0965 ..
+ INIT \u0966 0
+ INIT \u0967 1
+ INIT \u0968 2
+ INIT \u0969 3
+ INIT \u096a 4
+ INIT \u096b 5
+ INIT \u096c 6
+ INIT \u096d 7
+ INIT \u096e 8
+ INIT \u096f 9
+
+ INIT \u0901 ~
+
+ INIT \u0950 o~
+
+
+ INIT \u0903\u0952 \H
+ INIT \u0902\u0952 \M
+ INIT \u0903\ua8eb /H
+ INIT \u0902\ua8eb /M
+ INIT \u0903\u0951 ^H
+ INIT \u0902\u0951 ^M
+
+ INIT \u0903\u0951\u0952 ^\H
+ INIT \u0902\u0951\u0952 ^\M
+ INIT \u0901\u0952 \~
+ INIT \u0901\u0951 ^~
+ INIT \u0901\ua8eb /~
+ INIT \u0901\u0903\ua8eb /~H
+ INIT \u1cf2 Z
+
+
+ INIT \u093e ¬A
+ INIT \u093f ¬i
+ INIT \u0940 ¬I
+ INIT \u0941 ¬u
+ INIT \u0942 ¬U
+ INIT \u0943 ¬f
+ INIT \u0944 ¬F
+ INIT \u0962 ¬x
+ INIT \u0963 ¬X
+ INIT \u0947 ¬e
+ INIT \u0948 ¬E
+ INIT \u094b ¬o
+ INIT \u094c ¬O
+
+
diff --git a/pwgissues/issue78/transcode1/slp1_deva1.xml b/pwgissues/issue78/transcode1/slp1_deva1.xml
new file mode 100644
index 0000000..737208e
--- /dev/null
+++ b/pwgissues/issue78/transcode1/slp1_deva1.xml
@@ -0,0 +1,195 @@
+
+
+ INIT a \u0905 INIT
+ INIT A \u0906 INIT
+ INIT i \u0907 INIT
+ INIT I \u0908 INIT
+ INIT u \u0909 INIT
+ INIT U \u090a INIT
+ INIT f \u090b INIT
+ INIT F \u0960 INIT
+ INIT x \u090c INIT
+ INIT X \u0961 INIT
+ INIT e \u090f INIT
+ INIT E \u0910 INIT
+ INIT o \u0913 INIT
+ INIT O \u0914 INIT
+
+ SKT a INIT
+ SKT A \u093e INIT
+ SKT i \u093f INIT
+ SKT I \u0940 INIT
+ SKT u \u0941 INIT
+ SKT U \u0942 INIT
+ SKT f \u0943 INIT
+ SKT F \u0944 INIT
+ SKT x \u0962 INIT
+ SKT X \u0963 INIT
+ SKT e \u0947 INIT
+ SKT E \u0948 INIT
+ SKT o \u094b INIT
+ SKT O \u094c INIT
+
+
+ INIT,SKT k/^([^aAiIuUfFxXeEoO^/\\]) \u0915\u094d SKT
+ INIT,SKT K/^([^aAiIuUfFxXeEoO^/\\]) \u0916\u094d SKT
+ INIT,SKT g/^([^aAiIuUfFxXeEoO^/\\]) \u0917\u094d SKT
+ INIT,SKT G/^([^aAiIuUfFxXeEoO^/\\]) \u0918\u094d SKT
+ INIT,SKT N/^([^aAiIuUfFxXeEoO^/\\]) \u0919\u094d SKT
+ INIT,SKT c/^([^aAiIuUfFxXeEoO^/\\]) \u091a\u094d SKT
+ INIT,SKT C/^([^aAiIuUfFxXeEoO^/\\]) \u091b\u094d SKT
+ INIT,SKT j/^([^aAiIuUfFxXeEoO^/\\]) \u091c\u094d SKT
+ INIT,SKT J/^([^aAiIuUfFxXeEoO^/\\]) \u091d\u094d SKT
+ INIT,SKT Y/^([^aAiIuUfFxXeEoO^/\\]) \u091e\u094d SKT
+ INIT,SKT w/^([^aAiIuUfFxXeEoO^/\\]) \u091f\u094d SKT
+ INIT,SKT W/^([^aAiIuUfFxXeEoO^/\\]) \u0920\u094d SKT
+ INIT,SKT q/^([^aAiIuUfFxXeEoO^/\\]) \u0921\u094d SKT
+ INIT,SKT Q/^([^aAiIuUfFxXeEoO^/\\]) \u0922\u094d SKT
+ INIT,SKT R/^([^aAiIuUfFxXeEoO^/\\]) \u0923\u094d SKT
+ INIT,SKT t/^([^aAiIuUfFxXeEoO^/\\]) \u0924\u094d SKT
+ INIT,SKT T/^([^aAiIuUfFxXeEoO^/\\]) \u0925\u094d SKT
+ INIT,SKT d/^([^aAiIuUfFxXeEoO^/\\]) \u0926\u094d SKT
+ INIT,SKT D/^([^aAiIuUfFxXeEoO^/\\]) \u0927\u094d SKT
+ INIT,SKT n/^([^aAiIuUfFxXeEoO^/\\]) \u0928\u094d SKT
+ INIT,SKT p/^([^aAiIuUfFxXeEoO^/\\]) \u092a\u094d SKT
+ INIT,SKT P/^([^aAiIuUfFxXeEoO^/\\]) \u092b\u094d SKT
+ INIT,SKT b/^([^aAiIuUfFxXeEoO^/\\]) \u092c\u094d SKT
+ INIT,SKT B/^([^aAiIuUfFxXeEoO^/\\]) \u092d\u094d SKT
+ INIT,SKT m/^([^aAiIuUfFxXeEoO^/\\]) \u092e\u094d SKT
+ INIT,SKT y/^([^aAiIuUfFxXeEoO^/\\]) \u092f\u094d SKT
+ INIT,SKT r/^([^aAiIuUfFxXeEoO^/\\]) \u0930\u094d SKT
+ INIT,SKT l/^([^aAiIuUfFxXeEoO^/\\]) \u0932\u094d SKT
+ INIT,SKT L/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d SKT
+ INIT,SKT |/^([^aAiIuUfFxXeEoO^/\\]) \u0933\u094d\u0939\u094d SKT
+ INIT,SKT v/^([^aAiIuUfFxXeEoO^/\\]) \u0935\u094d SKT
+ INIT,SKT S/^([^aAiIuUfFxXeEoO^/\\]) \u0936\u094d SKT
+ INIT,SKT z/^([^aAiIuUfFxXeEoO^/\\]) \u0937\u094d SKT
+ INIT,SKT s/^([^aAiIuUfFxXeEoO^/\\]) \u0938\u094d SKT
+ INIT,SKT h/^([^aAiIuUfFxXeEoO^/\\]) \u0939\u094d SKT
+ INIT,SKT k \u0915 SKT
+ INIT,SKT K \u0916 SKT
+ INIT,SKT g \u0917 SKT
+ INIT,SKT G \u0918 SKT
+ INIT,SKT N \u0919 SKT
+ INIT,SKT c \u091a SKT
+ INIT,SKT C \u091b SKT
+ INIT,SKT j \u091c SKT
+ INIT,SKT J \u091d SKT
+ INIT,SKT Y \u091e SKT
+ INIT,SKT w \u091f SKT
+ INIT,SKT W \u0920 SKT
+ INIT,SKT q \u0921 SKT
+ INIT,SKT Q \u0922 SKT
+ INIT,SKT R \u0923 SKT
+ INIT,SKT t \u0924 SKT
+ INIT,SKT T \u0925 SKT
+ INIT,SKT d \u0926 SKT
+ INIT,SKT D \u0927 SKT
+ INIT,SKT n \u0928 SKT
+ INIT,SKT p \u092a SKT
+ INIT,SKT P \u092b SKT
+ INIT,SKT b \u092c SKT
+ INIT,SKT B \u092d SKT
+ INIT,SKT m \u092e SKT
+ INIT,SKT y \u092f SKT
+ INIT,SKT r \u0930 SKT
+ INIT,SKT l \u0932 SKT
+ INIT,SKT L \u0933 SKT
+
+ INIT,SKT | \u0933\u094d\u0939 SKT
+ INIT,SKT v \u0935 SKT
+ INIT,SKT S \u0936 SKT
+ INIT,SKT z \u0937 SKT
+ INIT,SKT s \u0938 SKT
+ INIT,SKT h \u0939 SKT
+
+ INIT,SKT M \u0902 INIT
+ INIT,SKT H \u0903 INIT
+ INIT,SKT ' \u093d INIT
+ INIT,SKT . \u0964 INIT
+ INIT,SKT .. \u0965 INIT
+ INIT,SKT 0 \u0966 INIT
+ INIT,SKT 1 \u0967 INIT
+ INIT,SKT 2 \u0968 INIT
+ INIT,SKT 3 \u0969 INIT
+ INIT,SKT 4 \u096a INIT
+ INIT,SKT 5 \u096b INIT
+ INIT,SKT 6 \u096c INIT
+ INIT,SKT 7 \u096d INIT
+ INIT,SKT 8 \u096e INIT
+ INIT,SKT 9 \u096f INIT
+ INIT,SKT \u0020 \u0020 INIT
+ INIT,SKT \u0009 \u0009 INIT
+ INIT,SKT \u000d \u000d INIT
+ INIT,SKT \u000a \u000a INIT
+
+
+ INIT,SKT - - INIT
+
+ INIT,SKT \ \u0952 INIT
+
+ INIT,SKT / \ua8eb INIT
+
+ INIT,SKT ^ \u0951 INIT
+
+
+ INIT,SKT ~ \u0901 INIT
+
+ INIT,SKT o~ \u0950 INIT
+
+
+ INIT,SKT Z \u1cf2 INIT
+ INIT,SKT V \u1cf2 INIT
+
+
+ INIT,SKT \H \u0903\u0952 SKT
+ INIT,SKT \M \u0902\u0952 SKT
+ INIT,SKT /H \u0903\ua8eb SKT
+ INIT,SKT /M \u0902\ua8eb SKT
+ INIT,SKT ^H \u0903\u0951 SKT
+ INIT,SKT ^M \u0902\u0951 SKT
+
+ INIT,SKT ^\H \u0903\u0951\u0952 SKT
+ INIT,SKT ^\M \u0902\u0951\u0952 SKT
+ INIT,SKT \~ \u0901\u0952 SKT
+ INIT,SKT ^~ \u0901\u0951 SKT
+ INIT,SKT /~ \u0901\ua8eb SKT
+ INIT,SKT /~H \u0901\u0903\ua8eb SKT
+
+
+ INIT,SKT £ \ua8f2 INIT
+
+
+ INIT,SKT ¬A \u093e
+ INIT,SKT ¬i \u093f
+ INIT,SKT ¬I \u0940
+ INIT,SKT ¬u \u0941
+ INIT,SKT ¬U \u0942
+ INIT,SKT ¬f \u0943
+ INIT,SKT ¬F \u0944
+ INIT,SKT ¬x \u0962
+ INIT,SKT ¬X \u0963
+ INIT,SKT ¬e \u0947
+ INIT,SKT ¬E \u0948
+ INIT,SKT ¬o \u094b
+ INIT,SKT ¬O \u094c
+
+
+
diff --git a/pwgissues/issue78/transcoder.py b/pwgissues/issue78/transcoder.py
new file mode 100644
index 0000000..aee6177
--- /dev/null
+++ b/pwgissues/issue78/transcoder.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+
+""" Python version of transcoder.
+ Uses built-in library xml.etree.ElementTree,
+ rather than lxml.
+ Revised 02-20-2017 Regarding special handling of slp1 to deva;
+ search for regexCode variable, and fsmentry['regex'] for where this comes into play.
+ This kind of coding is ugly, and needs to be revised for greater generality.
+"""
+from __future__ import print_function
+
+__program_name__ = 'transcoder.py'
+__author__ = 'Jim Funderburk'
+__email__ = 'funderburk1@verizon.net'
+__copyright__ = 'Copyright 2011, Jim Funderburk'
+__license__ = 'GPL http://www.gnu.org/licenses/gpl.txt'
+__date__ = '2011-12'
+
+# Python Standard Library
+import os
+import sys
+import codecs
+#import locale
+import re
+#import logging
+from unicodedata import normalize
+#from operator import itemgetter
+#from lxml import etree
+import xml.etree.ElementTree as ET
+
+## Jim Funderburk recoding into php of Java code developed by
+## Ralph Bunker.
+## This software is made available under the Creative Commons
+## Creative Commons Attribution Non-Commercial Share Alike license available in full at , and summarized at . Permission is granted to build upon this work non-commercially, as long as credit is explicitly acknowledged exactly as described herein and derivative work is distributed under the same license.
+## Assume transcoder xml files are in directory ../data/transcoder,
+## relative to the directory containing this transcoder.php file
+
+## two global variables
+# Assume transcoder xml files are in directory ../data/transcoder,
+# relative to the directory containing this transcoder.py file
+
+
+# transcoder_dir: directory searched for the transcoder xml files.
+# Default: data/transcoder under the parent of this file's directory.
+transcoder_dir =os.path.dirname(os.path.abspath(__file__))
+transcoder_dir = os.path.dirname(transcoder_dir) ## parent
+transcoder_dir += "/data/transcoder"
+# transcoder_fsmarr: cache of loaded finite state machines.
+transcoder_fsmarr = {} # a dictionary. keys are from+to
+python_version = sys.version[0] # first character: 2 or 3
+if python_version == '3':
+ # Python 2 names used by the code below
+ xrange = range
+ unichr = chr
+
+def transcoder_fsm(sfrom,to) :
+ """Load the finite state machine that transcodes from sfrom to to.
+
+ The machine is read from <transcoder_dir>/<sfrom>_<to>.xml and stored in
+ transcoder_fsmarr[sfrom + "_" + to] as a dictionary with the keys
+  'start'  : the start state (the 'start' attribute of the root element)
+  'fsm'    : list of entries, one dictionary per 'e' element of the file
+  'states' : maps a character to the list of indexes (into 'fsm') of the
+             entries whose input begins with that character
+ Nothing is done when the machine is already loaded, or when the xml file
+ does not exist.
+ """
+ fromto = sfrom + "_" + to
+ if fromto in transcoder_fsmarr:
+  return
+ # regexCode names the look-ahead logic (see transcoder_processString_match)
+ # used for entries whose input has the form X/^...
+ regexCode=None
+ if sfrom.startswith('slp1') and to.startswith('deva'):
+  regexCode = 'slp1_deva'
+ elif sfrom.startswith('deva') and to.startswith('slp1'):
+  regexCode = 'deva_slp1'
+ elif sfrom.startswith('hkt') and to.startswith('tamil'):
+  regexCode = 'hkt_tamil'
+
+ filein = transcoder_dir + '/' + fromto + ".xml"
+ if not os.path.exists(filein):
+  return
+ xml = ET.parse(filein).getroot()
+ fsm = {} ## finite state machine to construct
+ fsm['start'] = xml.attrib['start'] ## required
+ # fsmentries is a list of fsmentry elements, each of which is a hash
+ # corresponding to one of the 'e' elements in the xml file.
+ fsmentries = []
+ for e in xml:
+  if e.tag != 'e':
+   # skip comments
+   continue
+  inval = e.find("in").text
+  if not inval:
+   inval=''
+  conlook = False
+  match = re.match(r'^([^/]+)/\^',inval)
+  if match :
+   ## In transcoding from slp1 to devanagari, it is necessary to do a
+   ## 'look-ahead' when deciding how to code a consonant. If the
+   ## consonant is not followed by a vowel, then a virama has to be emitted.
+   ## The input codes inval in such cases as:
+   ##  k/^([^aAiIuUfFxXeEoO^/\\])
+   ## which is to be interpreted as: the entry applies only when the
+   ## character after the consonant is not one of aAiIuUfFxXeEoO^/\
+   ## Note that the last 3 elements '^', '/', and '\' are present only
+   ## because of accents.
+   ## Such entries are used only for the from/to pairs that have a
+   ## regexCode; for all other pairs the entry is skipped.
+   if not regexCode:
+    continue
+   inval = match.group(1)
+   conlook=True
+  # s = state name of this entry. Can be a comma-delimited list
+  startStates = e.find("s").text.split(",")
+  outval = e.find("out").text # out = the transformation of the input
+  if outval is None: # the parser returns None for an empty element
+   outval=''
+  x = e.find("next") # next state, this is optional.
+  if x is not None:
+   nextState = x.text
+  else:
+   # absent: use the first of the start states
+   nextState = startStates[0]
+
+  # inval, outval may be strings representing unicode.
+  # the format expected is \uxxxx\uyyyy etc. where xxxx and yyyy are
+  # four hex digits.
+  # construct this fsmentry as a hash of mixed values
+  fsmentry = {}
+  fsmentry['starts'] = startStates
+  fsmentry['in'] = to_unicode(inval)
+  # fsmentry['regex'] is defined only when conlook is true
+  if conlook:
+   fsmentry['regex']=regexCode
+  fsmentry['out']=to_unicode(outval)
+  fsmentry['next']=nextState
+  # Dec 5, 2013 save raw inval/outval
+  fsmentry['inraw']=inval
+  fsmentry['outraw']=outval
+  fsmentry['e-elt'] = ET.tostring(e)
+  fsmentries.append(fsmentry)
+
+ fsm['fsm']=fsmentries
+ ## make associative array states, whose keys are characters,
+ ## and whose value at a key is an array of subscripts into fsmentries.
+ ## i is a subscript for a key provided that the key is the
+ ## first character of fsmentries[i]['in']
+ states={}
+ for ientry,fsmentry in enumerate(fsmentries):
+  # first character of the input; the empty string when the input is empty
+  c = fsmentry['in'][:1]
+  states.setdefault(c,[]).append(ientry)
+
+ fsm['states']=states
+ transcoder_fsmarr[fromto]=fsm
+ # debug: change the condition to True to dump the machine to a text file
+ if (False):
+  print("filein=",filein)
+  filedbg = "dbg_%s.txt" %fromto
+  print("transcoder.py. Dbg info written to",filedbg)
+  keys = ['starts','in','regex','out','next','inraw','outraw']
+  with codecs.open(filedbg,"w","utf-8") as fdbg:
+   fdbg.write("fsmentries=...\n")
+   for i,fsmentry in enumerate(fsmentries):
+    s = []
+    for key in keys:
+     if key not in fsmentry: # regex
+      continue
+     val = fsmentry[key]
+     if key == 'starts':
+      val = ' '.join(val)
+     s.append("%s => %s" %(key,val))
+    sout = ' , '.join(s)
+    fdbg.write("fsmentry[%s]=%s\n" %(i,sout))
+    fdbg.write(" e-elt=%s\n" % fsmentry['e-elt'])
+   fdbg.write("states=...\n")
+   for c in states:
+    x = ' '.join('%s' % i for i in states[c])
+    fdbg.write("c=%s, state=%s\n" %(c,x))
+def to_unicode(x):
+ # x is assumed to be a string with one of two forms
+ # (a) \uxxxx\uyyyy  each \uxxxx (xxxx = four hex digits) is replaced by
+ #     the character with that code point; any text between the four
+ #     digits and the next \u is kept as is.
+ # (b) other - this is returned without change.  This includes strings
+ #     that do not begin with \u, and the two-character string \u itself.
+ # Raises ValueError when the text after a \u does not start with hex digits.
+ if (x == r"\u"): # a case where notation is confusing
+  return x
+ if not x.startswith('\\u'):
+  return x
+ try:
+  tochar = unichr # Python 2 builtin, or the alias defined for Python 3
+ except NameError:
+  tochar = chr
+ ans=''
+ for z in x.split('\\u'):
+  if (z == ''):
+   continue
+  # first four characters are the code point; the rest is literal text
+  ans += tochar(int(z[:4],16))
+  ans += z[4:]
+ return ans
+
+# Code points tested by the deva_slp1 look-ahead in
+# transcoder_processString_match: the virama (094d) and the dependent
+# vowel signs.
+vowel_signs = ['\u094d','\u093e','\u093f','\u0940','\u0941','\u0942','\u0943','\u0944','\u0962','\u0963','\u0947','\u0948','\u094b','\u094c']
+# the same list after to_unicode conversion
+vowel_signs_unicode = [to_unicode(vowel_sign) for vowel_sign in vowel_signs]
+
+def transcoder_processString(line,from1,to) :
+ """Return the string line transcoded from encoding from1 to encoding to.
+
+ The finite state machine for the pair is loaded by transcoder_fsm on
+ first use and then taken from transcoder_fsmarr.  line is returned
+ unchanged when from1 == to, or when no machine is available for the pair.
+
+ At each position, the entries that begin with the current character and
+ list the current state among their start states are tried; the entry
+ with the longest match is used (the earliest one on ties).  Its output
+ is emitted and its 'next' becomes the current state.  When no entry
+ matches, the character is copied and the state is reset to the start state.
+ """
+ if (from1 == to) :
+  return line
+ fromto = from1 + "_" + to
+ if fromto not in transcoder_fsmarr:
+  transcoder_fsm(from1,to)
+  if fromto not in transcoder_fsmarr:
+   return line
+ fsm = transcoder_fsmarr[fromto]
+ currentState=fsm['start']
+ fsmentries = fsm['fsm']
+ states = fsm['states']
+ n=0 ## current character position in line
+ result='' ## returned value
+ m=len(line)
+ while (n < m) :
+  c = line[n] # character at position n
+  nbest=0
+  bestFE = None
+  # a character that begins no entry has no candidates at all
+  for isub in states.get(c,()):
+   fsmentry=fsmentries[isub]
+   if currentState not in fsmentry['starts']:
+    continue
+   match = transcoder_processString_match(line,n,m,fsmentry)
+   nmatch=len(match)
+   if (nmatch > nbest) :
+    nbest=nmatch
+    bestFE=fsmentry
+
+  if bestFE is not None:
+   result += bestFE['out']
+   n += nbest
+   currentState=bestFE['next']
+  else :
+   ## Default condition. emit the character and change state to start
+   result += c
+   currentState=fsm['start']
+   n += 1
+
+ return result
+
+def transcoder_processString_match(line,n,m,fsmentry) :
+ """Test whether the entry fsmentry applies to line at position n.
+
+ line     : the string being transcoded
+ n        : position in line at which the entry's input must begin
+ m        : length of line
+ fsmentry : one entry of a finite state machine (see transcoder_fsm)
+ Returns fsmentry['in'] when the entry applies and the empty string
+ otherwise.  The caller cannot distinguish a failed match from a
+ successful match of an empty input.
+
+ For an entry with a 'regex' key the character following the matched
+ input is also examined, unless the input ends the line:
+  slp1_deva : applies unless that character is one of aAiIuUfFxXeEoO^/\
+  hkt_tamil : applies unless that character is one of aAiIuUeEoO
+  deva_slp1 : applies unless a virama or vowel sign (vowel_signs_unicode)
+              follows
+ """
+ edge = fsmentry['in']
+ if not line.startswith(edge,n,m):
+  return ""
+ match=edge
+ if (not 'regex' in fsmentry):
+  return match
+
+ ## additional logic for the look-ahead entries;
+ ## see discussion of 'regex' in transcoder_fsm
+ n1=n+len(match)
+ if (n1 == m) :
+  # nothing follows the matched input
+  return match
+ d = line[n1]
+ if (fsmentry['regex'] == 'slp1_deva') :
+  test = re.match(r'[^aAiIuUfFxXeEoO^\/\\\\]',d)
+  if (test) :
+   return match
+  return ""
+
+ if (fsmentry['regex'] == 'hkt_tamil') :
+  test = re.match('[^aAiIuUeEoO]',d)
+  if (test):
+   return match
+  return ""
+
+ if (fsmentry['regex'] == 'deva_slp1'):
+  for vowel_sign1 in vowel_signs_unicode:
+   if line.startswith(vowel_sign1,n1,m):
+    # the consonant is followed by a virama or a vowel sign.
+    # return empty string to indicate rule failure.
+    return ""
+  # the consonant is not followed by either virama or a vowel sign.
+  return match
+ return ""
+def transcoder_processElements(line,from1,to,tagname):
+ ## Transcode the parts of line that are marked in an xml way.
+ ## For example, if tagname = 'SA'
+ ## and line = 'The word <SA>rAma</SA> refers to a person',
+ ## returned would be 'The word XXX refers to a person',
+ ## where XXX is the transformation of the string 'rAma' acc. to from1,to.
+ ## Note that the tags themselves are not part of the result.
+ global transcoder_from,transcoder_to
+ # the encodings are passed to the re.sub callback through these globals
+ transcoder_from = from1
+ transcoder_to = to
+ # match <tagname>...</tagname>; group 1 is the text between the tags
+ regex = '<%s>(.*?)</%s>'%(tagname,tagname)
+ ans = re.sub(regex,transcoder_processElements_callback,line)
+ return ans
+
+def transcoder_processElements_callback(match) :
+ # re.sub callback used by transcoder_processElements: transcode the text
+ # captured by group 1, using the encodings saved in the module-level
+ # variables transcoder_from and transcoder_to.
+ text = match.group(1)
+ return transcoder_processString(text,transcoder_from,transcoder_to)
+def transcoder_set_dir(dir) :
+ """Set the directory that is searched for the transcoder xml files.
+
+ dir is converted to an absolute path.  When that path exists it becomes
+ the new transcoder directory; otherwise the current setting is kept.
+ Returns the transcoder directory in effect after the call, so a caller
+ can detect a rejected dir by comparing the result with the absolute
+ path of dir.
+ """
+ global transcoder_dir
+ path = os.path.abspath(dir)
+ if os.path.exists(path):
+  transcoder_dir = path
+ return transcoder_dir
+
+def transcoder_get_dir() :
+ # Return the directory currently searched for the transcoder xml files
+ # (the module-level transcoder_dir).
+ return transcoder_dir