Skip to content

Commit

Permalink
#78 first test
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Oct 6, 2024
1 parent 941e570 commit 366b4f5
Show file tree
Hide file tree
Showing 10 changed files with 1,375 additions and 0 deletions.
100 changes: 100 additions & 0 deletions pwgissues/issue78/debug_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#-*- coding:utf-8 -*-
"""debug_transcode.py
"""
from __future__ import print_function
import sys, re,codecs
import transcoder

def read_lines(filein):
    """Read a UTF-8 text file and return its lines without line endings.

    filein: path of the file to read.
    Returns a list of strings, one per line, with trailing '\\r' and
    '\\n' characters removed.  A one-line count summary is printed.
    """
    lines = []
    with codecs.open(filein, mode='r', encoding='utf-8') as handle:
        for raw in handle:
            # Strip only the line terminator; other trailing whitespace
            # is kept as part of the line.
            lines.append(raw.rstrip('\r\n'))
    print(len(lines), "lines read from", filein)
    return lines

def write_lines(fileout, outarr):
    """Write each string of ``outarr`` to ``fileout`` as one UTF-8 line.

    fileout: path of the file to create or overwrite.
    outarr: sequence of strings; a '\\n' is appended to every item.
    A one-line count summary is printed after the file is closed.
    """
    with codecs.open(fileout, "w", "utf-8") as handle:
        handle.writelines(text + '\n' for text in outarr)
    print(len(outarr), "lines written to", fileout)

def print_unicode(u):
    """Describe every character of ``u`` as 'CODEPOINT | char | NAME'.

    Despite its name this function prints nothing; it returns a list
    with one formatted string per character, in order.  The code point
    is upper-case hexadecimal, zero-padded to at least four digits.

    Characters that have no Unicode name (for example control
    characters such as TAB) are reported with the name '<unnamed>'
    instead of raising ValueError.

    Sample output:
     x= a/MSa—BU/
    0905 | अ | DEVANAGARI LETTER A
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    0902 | ं | DEVANAGARI SIGN ANUSVARA
    0936 | श | DEVANAGARI LETTER SHA
    2014 | — | EM DASH
    092D | भ | DEVANAGARI LETTER BHA
    0942 | ू | DEVANAGARI VOWEL SIGN UU
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    """
    import unicodedata
    # unicodedata.name() raises ValueError for unnamed characters unless
    # a default is given; a debugging dump should not crash on them.
    return [
        f"{ord(c):04X} | {c} | {unicodedata.name(c, '<unnamed>')}"
        for c in u
    ]

def transcode(x,tranin,tranout):
    """Convert string ``x`` from transcoding ``tranin`` to ``tranout``.

    Thin wrapper that forwards its arguments unchanged to
    transcoder.transcoder_processString and returns the result.
    """
    return transcoder.transcoder_processString(x, tranin, tranout)

def convert_line(line,tranin,tranout,outarr):
    """Transcode every '{#X#}' span of ``line`` and check invertibility.

    Each span's text X is converted from ``tranin`` to ``tranout`` and
    re-wrapped as '{#Y#}'.  Y is then converted back; when the result
    differs from X, diagnostic lines are appended to ``outarr``: the
    original/converted pair with a per-character dump of X, then the
    round-tripped pair with a per-character dump of the round-tripped
    text.

    Returns the converted line.  ``outarr`` is modified in place.
    """
    def replace_span(match):
        source = match.group(1)
        converted = transcode(source, tranin, tranout)
        wrapped = '{#%s#}' % converted
        # Invertibility check: converting back should reproduce the
        # source text exactly.
        roundtrip = transcode(converted, tranout, tranin)
        if roundtrip != source:
            reconverted = transcode(roundtrip, tranin, tranout)
            outarr.append(' x=%s, y=%s' % (source, converted))
            outarr.extend(print_unicode(source))
            outarr.append('x1=%s, y1=%s' % (roundtrip, reconverted))
            outarr.extend(print_unicode(roundtrip))
        return wrapped

    return re.sub('{#(.*?)#}', replace_span, line)

def convert_lines(lines,tranin,tranout,outarr):
    """Apply convert_line to each item of ``lines``, in order.

    Returns a new list of converted lines.  Diagnostics produced by
    convert_line are accumulated in ``outarr``.
    """
    return [convert_line(text, tranin, tranout, outarr) for text in lines]


def test(lines, tranin=None, tranout=None):
    """Transcode every '{#X#}' span of each line; return the new lines.

    lines: iterable of strings.
    tranin, tranout: optional source and target transcoding names.
        When left as None, the module-level variable of the same name
        is used.  Those variables are assigned only when this file is
        run as a script, so pass the names explicitly when calling
        this function after importing the module.

    Returns a list with one converted string per input line.
    Raises NameError if a transcoding name is needed (a line contains
    a '{#...#}' span) but was neither passed nor set at module level.
    """
    def pick(explicit, name):
        # An explicit argument wins.  The module-level fallback is
        # looked up lazily so lines without markup never require it,
        # which keeps plain test(lines) calls working as before.
        if explicit is not None:
            return explicit
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "name %r is not defined; pass it to test() explicitly"
                % name)

    def replace_span(m):
        y = transcode(m.group(1),
                      pick(tranin, 'tranin'),
                      pick(tranout, 'tranout'))
        return '{#%s#}' % y

    pattern = re.compile(r'{#(.*?)#}')
    return [pattern.sub(replace_span, line) for line in lines]

if __name__=="__main__":
    # Command line:
    #   python debug_transcode.py transcoderdir tranin tranout filein fileout
    # Fail with a usage message instead of a bare IndexError when
    # arguments are missing.  Extra arguments are ignored, as before.
    if len(sys.argv) < 6:
        print("Usage: python debug_transcode.py"
              " transcoderdir tranin tranout filein fileout",
              file=sys.stderr)
        sys.exit(1)
    transcoderdir = sys.argv[1]
    transcoder.transcoder_set_dir(transcoderdir)
    # tranin and tranout must remain module-level names: test(lines)
    # looks them up at module level.
    tranin = sys.argv[2]
    tranout = sys.argv[3]
    filein = sys.argv[4]
    fileout = sys.argv[5]

    lines = read_lines(filein)
    outarr = test(lines)
    write_lines(fileout,outarr)
26 changes: 26 additions & 0 deletions pwgissues/issue78/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
10-06-2024
issue: https://github.com/sanskrit-lexicon/PWG/issues/78

# This directory
cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue78

# proposed enhancement of slp1 transcoding for vowel-markers.

https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt

There is a 'special' transcoding of slp1 to Devanagari
that is used in dictionaries: pwg, pw, sch, pwkvn
The transcoding files are slp1_deva1.xml and its inverse deva1_slp1.xml
We start with the versions of these two files in transcode0 directory.

The revision is in transcode1 directory.

python debug_transcode.py transcode1 slp1 deva1 test01_in.txt test01_out.txt
python debug_transcode.py transcode1 deva1 slp1 test01_out.txt test01_in_out.txt

test01_in_out.txt differs from test01_in.txt: the '¬' marker present in the input
is missing after the round trip, so the transcoding is not invertible.

Note that deva1_slp1.xml defines a finite-state machine with only one state,
the INIT state.

4 changes: 4 additions & 0 deletions pwgissues/issue78/test01_in.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{#tarudU¬AlakA#}
{#pElA¬Ada#}
{#rA¬ARa/#}
{#¬i#}
4 changes: 4 additions & 0 deletions pwgissues/issue78/test01_in_out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{#tarudUAlakA#}
{#pElAAda#}
{#rAARa/#}
{#i#}
4 changes: 4 additions & 0 deletions pwgissues/issue78/test01_out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{#तरुदूालका#}
{#पैलााद#}
{#रााण꣫#}
{#ि#}
220 changes: 220 additions & 0 deletions pwgissues/issue78/transcode0/deva1_slp1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
<fsm start='INIT' inputDecoding='UTF-8' outputEncoding='UTF-8'>
<!-- 09-27-2024. Inverse of slp1_deva1.xml -->
<e> <s>INIT</s> <in>\u0905</in> <out>a</out></e>
<e> <s>INIT</s> <in>\u0906</in> <out>A</out></e>
<e> <s>INIT</s> <in>\u0907</in> <out>i</out></e>
<e> <s>INIT</s> <in>\u0908</in> <out>I</out></e>
<e> <s>INIT</s> <in>\u0909</in> <out>u</out></e>
<e> <s>INIT</s> <in>\u090a</in> <out>U</out></e>
<e> <s>INIT</s> <in>\u090b</in> <out>f</out></e>
<e> <s>INIT</s> <in>\u0960</in> <out>F</out></e>
<e> <s>INIT</s> <in>\u090c</in> <out>x</out></e>
<e> <s>INIT</s> <in>\u0961</in> <out>X</out></e>
<e> <s>INIT</s> <in>\u090f</in> <out>e</out></e>
<e> <s>INIT</s> <in>\u0910</in> <out>E</out></e>
<e> <s>INIT</s> <in>\u0913</in> <out>o</out></e>
<e> <s>INIT</s> <in>\u0914</in> <out>O</out></e>

<e> <s>INIT</s> <in>\u0905\ua8eb</in> <out>a/</out></e>
<e> <s>INIT</s> <in>\u0906\ua8eb</in> <out>A/</out></e>
<e> <s>INIT</s> <in>\u0907\ua8eb</in> <out>i/</out></e>
<e> <s>INIT</s> <in>\u0908\ua8eb</in> <out>I/</out></e>
<e> <s>INIT</s> <in>\u0909\ua8eb</in> <out>u/</out></e>
<e> <s>INIT</s> <in>\u090a\ua8eb</in> <out>U/</out></e>
<e> <s>INIT</s> <in>\u090b\ua8eb</in> <out>f/</out></e>
<e> <s>INIT</s> <in>\u0960\ua8eb</in> <out>F/</out></e>
<e> <s>INIT</s> <in>\u090c\ua8eb</in> <out>x/</out></e>
<e> <s>INIT</s> <in>\u0961\ua8eb</in> <out>X/</out></e>
<e> <s>INIT</s> <in>\u090f\ua8eb</in> <out>e/</out></e>
<e> <s>INIT</s> <in>\u0910\ua8eb</in> <out>E/</out></e>
<e> <s>INIT</s> <in>\u0913\ua8eb</in> <out>o/</out></e>
<e> <s>INIT</s> <in>\u0914\ua8eb</in> <out>O/</out></e>

<e> <s>INIT</s> <in>\u0905\u0952</in> <out>a\</out></e>
<e> <s>INIT</s> <in>\u0906\u0952</in> <out>A\</out></e>
<e> <s>INIT</s> <in>\u0907\u0952</in> <out>i\</out></e>
<e> <s>INIT</s> <in>\u0908\u0952</in> <out>I\</out></e>
<e> <s>INIT</s> <in>\u0909\u0952</in> <out>u\</out></e>
<e> <s>INIT</s> <in>\u090a\u0952</in> <out>U\</out></e>
<e> <s>INIT</s> <in>\u090b\u0952</in> <out>f\</out></e>
<e> <s>INIT</s> <in>\u0960\u0952</in> <out>F\</out></e>
<e> <s>INIT</s> <in>\u090c\u0952</in> <out>x\</out></e>
<e> <s>INIT</s> <in>\u0961\u0952</in> <out>X\</out></e>
<e> <s>INIT</s> <in>\u090f\u0952</in> <out>e\</out></e>
<e> <s>INIT</s> <in>\u0910\u0952</in> <out>E\</out></e>
<e> <s>INIT</s> <in>\u0913\u0952</in> <out>o\</out></e>
<e> <s>INIT</s> <in>\u0914\u0952</in> <out>O\</out></e>

<e> <s>INIT</s> <in>\u093e</in> <out>A</out></e>
<e> <s>INIT</s> <in>\u093f</in> <out>i</out></e>
<e> <s>INIT</s> <in>\u0940</in> <out>I</out></e>
<e> <s>INIT</s> <in>\u0941</in> <out>u</out></e>
<e> <s>INIT</s> <in>\u0942</in> <out>U</out></e>
<e> <s>INIT</s> <in>\u0943</in> <out>f</out></e>
<e> <s>INIT</s> <in>\u0944</in> <out>F</out></e>
<e> <s>INIT</s> <in>\u0962</in> <out>x</out></e>
<e> <s>INIT</s> <in>\u0963</in> <out>X</out></e>
<e> <s>INIT</s> <in>\u0947</in> <out>e</out></e>
<e> <s>INIT</s> <in>\u0948</in> <out>E</out></e>
<e> <s>INIT</s> <in>\u094b</in> <out>o</out></e>
<e> <s>INIT</s> <in>\u094c</in> <out>O</out></e>

<e> <s>INIT</s> <in>\u0951</in> <out>^</out></e>
<e> <s>INIT</s> <in>\u093e\u0951</in> <out>A^</out></e>
<e> <s>INIT</s> <in>\u093f\u0951</in> <out>i^</out></e>
<e> <s>INIT</s> <in>\u0940\u0951</in> <out>I^</out></e>
<e> <s>INIT</s> <in>\u0941\u0951</in> <out>u^</out></e>
<e> <s>INIT</s> <in>\u0942\u0951</in> <out>U^</out></e>
<e> <s>INIT</s> <in>\u0943\u0951</in> <out>f^</out></e>
<e> <s>INIT</s> <in>\u0944\u0951</in> <out>F^</out></e>
<e> <s>INIT</s> <in>\u0962\u0951</in> <out>x^</out></e>
<e> <s>INIT</s> <in>\u0963\u0951</in> <out>X^</out></e>
<e> <s>INIT</s> <in>\u0947\u0951</in> <out>e^</out></e>
<e> <s>INIT</s> <in>\u0948\u0951</in> <out>E^</out></e>
<e> <s>INIT</s> <in>\u094b\u0951</in> <out>o^</out></e>
<e> <s>INIT</s> <in>\u094c\u0951</in> <out>O^</out></e>

<e> <s>INIT</s> <in>\u0952</in> <out>\</out></e>
<e> <s>INIT</s> <in>\u093e\u0952</in> <out>A\</out></e>
<e> <s>INIT</s> <in>\u093f\u0952</in> <out>i\</out></e>
<e> <s>INIT</s> <in>\u0940\u0952</in> <out>I\</out></e>
<e> <s>INIT</s> <in>\u0941\u0952</in> <out>u\</out></e>
<e> <s>INIT</s> <in>\u0942\u0952</in> <out>U\</out></e>
<e> <s>INIT</s> <in>\u0943\u0952</in> <out>f\</out></e>
<e> <s>INIT</s> <in>\u0944\u0952</in> <out>F\</out></e>
<e> <s>INIT</s> <in>\u0962\u0952</in> <out>x\</out></e>
<e> <s>INIT</s> <in>\u0963\u0952</in> <out>X\</out></e>
<e> <s>INIT</s> <in>\u0947\u0952</in> <out>e\</out></e>
<e> <s>INIT</s> <in>\u0948\u0952</in> <out>E\</out></e>
<e> <s>INIT</s> <in>\u094b\u0952</in> <out>o\</out></e>
<e> <s>INIT</s> <in>\u094c\u0952</in> <out>O\</out></e>

<e> <s>INIT</s> <in>\ua8eb</in> <out>/</out></e>
<e> <s>INIT</s> <in>\u093e\ua8eb</in> <out>A/</out></e>
<e> <s>INIT</s> <in>\u093f\ua8eb</in> <out>i/</out></e>
<e> <s>INIT</s> <in>\u0940\ua8eb</in> <out>I/</out></e>
<e> <s>INIT</s> <in>\u0941\ua8eb</in> <out>u/</out></e>
<e> <s>INIT</s> <in>\u0942\ua8eb</in> <out>U/</out></e>
<e> <s>INIT</s> <in>\u0943\ua8eb</in> <out>f/</out></e>
<e> <s>INIT</s> <in>\u0944\ua8eb</in> <out>F/</out></e>
<e> <s>INIT</s> <in>\u0962\ua8eb</in> <out>x/</out></e>
<e> <s>INIT</s> <in>\u0963\ua8eb</in> <out>X/</out></e>
<e> <s>INIT</s> <in>\u0947\ua8eb</in> <out>e/</out></e>
<e> <s>INIT</s> <in>\u0948\ua8eb</in> <out>E/</out></e>
<e> <s>INIT</s> <in>\u094b\ua8eb</in> <out>o/</out></e>
<e> <s>INIT</s> <in>\u094c\ua8eb</in> <out>O/</out></e>

<!-- virama outputs empty string -->
<e> <s>INIT</s> <in>\u094d</in> <out></out></e>
<!-- if a consonant is **not** followed by a vowel sign or a virama, then include schwa Note: the /^ implies the check (by hard-coding) in transcoder.php-->
<e> <s>INIT</s> <in>\u0915/^</in> <out>ka</out></e>
<e> <s>INIT</s> <in>\u0916/^</in> <out>Ka</out></e>
<e> <s>INIT</s> <in>\u0917/^</in> <out>ga</out></e>
<e> <s>INIT</s> <in>\u0918/^</in> <out>Ga</out></e>
<e> <s>INIT</s> <in>\u0919/^</in> <out>Na</out></e>
<e> <s>INIT</s> <in>\u091a/^</in> <out>ca</out></e>
<e> <s>INIT</s> <in>\u091b/^</in> <out>Ca</out></e>
<e> <s>INIT</s> <in>\u091c/^</in> <out>ja</out></e>
<e> <s>INIT</s> <in>\u091d/^</in> <out>Ja</out></e>
<e> <s>INIT</s> <in>\u091e/^</in> <out>Ya</out></e>
<e> <s>INIT</s> <in>\u091f/^</in> <out>wa</out></e>
<e> <s>INIT</s> <in>\u0920/^</in> <out>Wa</out></e>
<e> <s>INIT</s> <in>\u0921/^</in> <out>qa</out></e>
<e> <s>INIT</s> <in>\u0922/^</in> <out>Qa</out></e>
<e> <s>INIT</s> <in>\u0923/^</in> <out>Ra</out></e>
<e> <s>INIT</s> <in>\u0924/^</in> <out>ta</out></e>
<e> <s>INIT</s> <in>\u0925/^</in> <out>Ta</out></e>
<e> <s>INIT</s> <in>\u0926/^</in> <out>da</out></e>
<e> <s>INIT</s> <in>\u0927/^</in> <out>Da</out></e>
<e> <s>INIT</s> <in>\u0928/^</in> <out>na</out></e>
<e> <s>INIT</s> <in>\u092a/^</in> <out>pa</out></e>
<e> <s>INIT</s> <in>\u092b/^</in> <out>Pa</out></e>
<e> <s>INIT</s> <in>\u092c/^</in> <out>ba</out></e>
<e> <s>INIT</s> <in>\u092d/^</in> <out>Ba</out></e>
<e> <s>INIT</s> <in>\u092e/^</in> <out>ma</out></e>
<e> <s>INIT</s> <in>\u092f/^</in> <out>ya</out></e>
<e> <s>INIT</s> <in>\u0930/^</in> <out>ra</out></e>
<e> <s>INIT</s> <in>\u0932/^</in> <out>la</out></e>
<e> <s>INIT</s> <in>\u0933/^</in> <out>La</out></e>
<e> <s>INIT</s> <in>\u0933\u094d\u0939/^</in> <out>|a</out></e>
<e> <s>INIT</s> <in>\u0935/^</in> <out>va</out></e>
<e> <s>INIT</s> <in>\u0936/^</in> <out>Sa</out></e>
<e> <s>INIT</s> <in>\u0937/^</in> <out>za</out></e>
<e> <s>INIT</s> <in>\u0938/^</in> <out>sa</out></e>
<e> <s>INIT</s> <in>\u0939/^</in> <out>ha</out></e>

<!-- If a consonant is followed by something else, then just output the consonant with no schwa. The vowel will be added next. -->

<e> <s>INIT</s> <in>\u0915</in> <out>k</out></e>
<e> <s>INIT</s> <in>\u0916</in> <out>K</out></e>
<e> <s>INIT</s> <in>\u0917</in> <out>g</out></e>
<e> <s>INIT</s> <in>\u0918</in> <out>G</out></e>
<e> <s>INIT</s> <in>\u0919</in> <out>N</out></e>
<e> <s>INIT</s> <in>\u091a</in> <out>c</out></e>
<e> <s>INIT</s> <in>\u091b</in> <out>C</out></e>
<e> <s>INIT</s> <in>\u091c</in> <out>j</out></e>
<e> <s>INIT</s> <in>\u091d</in> <out>J</out></e>
<e> <s>INIT</s> <in>\u091e</in> <out>Y</out></e>
<e> <s>INIT</s> <in>\u091f</in> <out>w</out></e>
<e> <s>INIT</s> <in>\u0920</in> <out>W</out></e>
<e> <s>INIT</s> <in>\u0921</in> <out>q</out></e>
<e> <s>INIT</s> <in>\u0922</in> <out>Q</out></e>
<e> <s>INIT</s> <in>\u0923</in> <out>R</out></e>
<e> <s>INIT</s> <in>\u0924</in> <out>t</out></e>
<e> <s>INIT</s> <in>\u0925</in> <out>T</out></e>
<e> <s>INIT</s> <in>\u0926</in> <out>d</out></e>
<e> <s>INIT</s> <in>\u0927</in> <out>D</out></e>
<e> <s>INIT</s> <in>\u0928</in> <out>n</out></e>
<e> <s>INIT</s> <in>\u092a</in> <out>p</out></e>
<e> <s>INIT</s> <in>\u092b</in> <out>P</out></e>
<e> <s>INIT</s> <in>\u092c</in> <out>b</out></e>
<e> <s>INIT</s> <in>\u092d</in> <out>B</out></e>
<e> <s>INIT</s> <in>\u092e</in> <out>m</out></e>
<e> <s>INIT</s> <in>\u092f</in> <out>y</out></e>
<e> <s>INIT</s> <in>\u0930</in> <out>r</out></e>
<e> <s>INIT</s> <in>\u0932</in> <out>l</out></e>
<e> <s>INIT</s> <in>\u0933</in> <out>L</out></e>
<e> <s>INIT</s> <in>\u0933\u094d\u0939</in> <out>|</out></e>
<e> <s>INIT</s> <in>\u0935</in> <out>v</out></e>
<e> <s>INIT</s> <in>\u0936</in> <out>S</out></e>
<e> <s>INIT</s> <in>\u0937</in> <out>z</out></e>
<e> <s>INIT</s> <in>\u0938</in> <out>s</out></e>
<e> <s>INIT</s> <in>\u0939</in> <out>h</out></e>
<e> <s>INIT</s> <in>\u0902</in> <out>M</out></e>
<e> <s>INIT</s> <in>\u0903</in> <out>H</out></e>
<e> <s>INIT</s> <in>\u093d</in> <out>'</out></e>
<e> <s>INIT</s> <in>\u0964</in> <out>.</out></e>
<e> <s>INIT</s> <in>\u0965</in> <out>..</out></e>
<e> <s>INIT</s> <in>\u0966</in> <out>0</out></e>
<e> <s>INIT</s> <in>\u0967</in> <out>1</out></e>
<e> <s>INIT</s> <in>\u0968</in> <out>2</out></e>
<e> <s>INIT</s> <in>\u0969</in> <out>3</out></e>
<e> <s>INIT</s> <in>\u096a</in> <out>4</out></e>
<e> <s>INIT</s> <in>\u096b</in> <out>5</out></e>
<e> <s>INIT</s> <in>\u096c</in> <out>6</out></e>
<e> <s>INIT</s> <in>\u096d</in> <out>7</out></e>
<e> <s>INIT</s> <in>\u096e</in> <out>8</out></e>
<e> <s>INIT</s> <in>\u096f</in> <out>9</out></e>
<!-- candrabindu -->
<e> <s>INIT</s> <in>\u0901</in> <out>~</out></e>
<!-- OM -->
<e> <s>INIT</s> <in>\u0950</in> <out>o~</out></e>

<!-- special rules such as
<e n='123b'> <s>INIT</s> <in>\u0902\ua8eb</in> <out>/M</out></e>
-->
<e n='122a'> <s>INIT</s> <in>\u0903\u0952</in> <out>\H</out> </e>
<e n='122b'> <s>INIT</s> <in>\u0902\u0952</in> <out>\M</out> </e>
<e n='123a'> <s>INIT</s> <in>\u0903\ua8eb</in> <out>/H</out> </e>
<e n='123b'> <s>INIT</s> <in>\u0902\ua8eb</in> <out>/M</out> </e>
<e n='124a'> <s>INIT</s> <in>\u0903\u0951</in> <out>^H</out> </e>
<e n='124b'> <s>INIT</s> <in>\u0902\u0951</in> <out>^M</out> </e>
<!-- additional rewriting for accents. 8/13/2021 -->
<e n='125a'> <s>INIT</s> <in>\u0903\u0951\u0952</in> <out>^\H</out> </e>
<e n='125b'> <s>INIT</s> <in>\u0902\u0951\u0952</in> <out>^\M</out> </e>
<e n='126a'> <s>INIT</s> <in>\u0901\u0952</in> <out>\~</out> </e>
<e n='126b'> <s>INIT</s> <in>\u0901\u0951</in> <out>^~</out> </e>
<e n='126d'> <s>INIT</s> <in>\u0901\ua8eb</in> <out>/~</out> </e>
<e n='126e'> <s>INIT</s> <in>\u0901\u0903\ua8eb</in> <out>/~H</out> </e>
<e> <s>INIT</s> <in>\u1cf2</in> <out>Z</out> </e>
</fsm>
Loading

0 comments on commit 366b4f5

Please sign in to comment.