vntxt_0.txt: transcoding of vntxt_0_deva.txt. #76

sanskrit-lexicon · Sep 26, 2024 · 05b0723 · 05b0723
1 parent a67a6b7
commit 05b0723
Show file tree

Hide file tree

Showing 10 changed files with 2,948 additions and 0 deletions.
diff --git a/pwgissues/issue76/change_vntxt_0_deva.txt b/pwgissues/issue76/change_vntxt_0_deva.txt
@@ -0,0 +1,66 @@
+Notes on changes to vntxt_0_deva.txt
+These changes were made so that transcoding to slp1 is invertible.
+---
+old:
+{#अनरुस्#}	 ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति.#}
+new:
+{#अनरुस्#}	 ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति#}.
+---
+old:
+{#ऋजीक#}	 ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविर्ऋजीक#} ein Wort ist.
+new:
+{#ऋजीक#}	 ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविरृजीक#} ein Wort ist.
+This is subtle.  The unicode characters differ but when rendered look identical:
+old unicode:
+0906 | आ | DEVANAGARI LETTER AA
+0935 | व | DEVANAGARI LETTER VA
+093F | ि | DEVANAGARI VOWEL SIGN I
+0930 | र | DEVANAGARI LETTER RA
+094D | ् | DEVANAGARI SIGN VIRAMA    <<<
+090B | ऋ | DEVANAGARI LETTER VOCALIC R  <<<
+091C | ज | DEVANAGARI LETTER JA
+0940 | ी | DEVANAGARI VOWEL SIGN II
+0915 | क | DEVANAGARI LETTER KA
+new unicode:
+0906 | आ | DEVANAGARI LETTER AA
+0935 | व | DEVANAGARI LETTER VA
+093F | ि | DEVANAGARI VOWEL SIGN I
+0930 | र | DEVANAGARI LETTER RA
+0943 | ृ | DEVANAGARI VOWEL SIGN VOCALIC R   <<<
+091C | ज | DEVANAGARI LETTER JA
+0940 | ी | DEVANAGARI VOWEL SIGN II
+0915 | क | DEVANAGARI LETTER KA
+---
+old:
+{#जंह्॒#} und {#जंहस्#}	 ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls>
+new:
+{#जंह्#} und {#जंहस्#}	 ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls>
+old unicode:
+091C | ज | DEVANAGARI LETTER JA
+0902 | ं | DEVANAGARI SIGN ANUSVARA
+0939 | ह | DEVANAGARI LETTER HA
+094D | ् | DEVANAGARI SIGN VIRAMA
+0952 | ॒ | DEVANAGARI STRESS SIGN ANUDATTA   <<< to remove
+new unicode:
+091C | ज | DEVANAGARI LETTER JA
+0902 | ं | DEVANAGARI SIGN ANUSVARA
+0939 | ह | DEVANAGARI LETTER HA
+094D | ् | DEVANAGARI SIGN VIRAMA
+---
+old:
+{#तरुदूालका#}	 ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}.
+new: typo in 1st word
+{#तरुदूलिका#}	 ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}.
+---
+old:
+{#रााण꣫#}	 ¦ [6.0317] (auf Bogen 21*) ...
+new:  PWG style udAtta -> MW style udAtta, also hiatus
+; the cdsl spelling headword in rARa/ = राण॑
+{#राण॑#}	 ¦ [6.0317] (auf Bogen 21*)
+---
+old:
+{#राण॑#}	 ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#ि#} abgebrochen.
+new: Replace  DEVANAGARI VOWEL SIGN I with DEVANAGARI LETTER I
+{#राण॑#}	 ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#इ#} abgebrochen.
+;    Jim doesn't know how to represent in slp1 the 'naked' vowel sign.
+;    the hook above the {#ि#} is broken
diff --git a/pwgissues/issue76/readme.txt b/pwgissues/issue76/readme.txt
@@ -0,0 +1,51 @@
+09-25-2024
+issue: https://github.com/sanskrit-lexicon/PWG/issues/76
+
+# This directory
+cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76
+
+# starting point for vntxt digitization of missing VN material.
+https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt
+
+# local copy rename
+mv PWGVN_1-6_reformatted_.dng.txt vntxt_0_deva_orig.txt
+cp vntxt_0_deva_orig.txt vntxt_0_deva.txt
+
+
+# transcode
+mkdir transcode
+cd transcode
+python mark_deva.py ../vntxt_0_deva.txt vntxt_0_deva_marked.txt
+637 lines read from ../vntxt_0_deva.txt
+637 lines written to vntxt_0_deva_marked.txt
+Devanagari text has been marked and saved to vntxt_0_deva_marked.txt
+
+Note: This step is unnecessary! AB has already marked Devanagari as {#X#},
+ which is the pwg convention.
+rm vntxt_0_deva_marked.txt
+
+# transcode
+cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76/transcode
+mkdir pwgtranscoder1
+cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/deva_slp1.xml pwgtranscoder1/deva_slp1.xml
+cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/slp1_deva.xml pwgtranscoder1/slp1_deva.xml
+
+cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder.py .
+cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/mw_transcode.py pwg_transcode.py
+
+# heavily edit pwg_transcode.py
+
+-----------------
+Transcode
+# Some editing of vntxt_0_deva.txt related to transcoding to get invertibility
+See change_vntxt_0_deva.txt
+
+python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt
+# check invertibility
+python pwg_transcode.py pwgtranscoder1 slp1 deva ../vntxt_0.txt tempchk.txt
+diff ../vntxt_0_deva.txt tempchk.txt | wc -l
+0  # invertibility checks.
+
+ python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt
+
+
diff --git a/pwgissues/issue76/transcode/mark_deva.py b/pwgissues/issue76/transcode/mark_deva.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+"""
+ mark_deva.py
+ Code assistance from Copilot
+"""
+import re, sys, codecs
+
+def read_lines(filein):
+    """Read the UTF-8 text file `filein` and return its lines as a list.
+
+    Trailing carriage returns and line feeds are removed from each line.
+    The number of lines read is reported on stdout.
+    """
+    lines = []
+    with codecs.open(filein, encoding='utf-8', mode='r') as f:
+        for raw in f:
+            lines.append(raw.rstrip('\r\n'))
+    print(len(lines), "lines read from", filein)
+    return lines
+
+def write_lines(fileout, outarr):
+    """Write the strings of `outarr` to `fileout` as UTF-8, one per line.
+
+    A line feed is appended to every string.  The number of lines written
+    is reported on stdout.
+    """
+    with codecs.open(fileout, "w", "utf-8") as f:
+        for text in outarr:
+            f.write(text + '\n')
+    print(len(outarr), "lines written to", fileout)
+
+# Code points treated as Devanagari text: the basic Devanagari block
+# (U+0900-U+097F), Vedic Extensions (U+1CD0-U+1CFF) and Devanagari Extended
+# (U+A8E0-U+A8FF).  The two extension blocks hold accent and other combining
+# marks; with only the basic block a word carrying such a mark would have
+# the mark left outside the <s>...</s> span.  Compiled once at import time.
+_DEVANAGARI_RUN = re.compile(r'[\u0900-\u097F\u1CD0-\u1CFF\uA8E0-\uA8FF]+')
+
+
+def mark_devanagari(text):
+    """Return `text` with each run of Devanagari characters wrapped in <s>...</s>.
+
+    A run is a maximal sequence of consecutive characters from the ranges in
+    _DEVANAGARI_RUN.  Any other character (spaces included) ends a run, so
+    two words separated by a space are marked separately.  Text without
+    Devanagari characters is returned unchanged.
+    """
+    return _DEVANAGARI_RUN.sub(lambda m: f'<s>{m.group()}</s>', text)
+
+def marklines(lines):
+    """Return a new list holding mark_devanagari(line) for each line of `lines`."""
+    return [mark_devanagari(text) for text in lines]
+
+if __name__=="__main__":
+    # Usage: python mark_deva.py <input file> <output file>
+    filein, fileout = sys.argv[1], sys.argv[2]
+
+    # Read the input, mark the Devanagari text of every line, and write
+    # the marked lines to the output file.
+    marked = marklines(read_lines(filein))
+    write_lines(fileout, marked)
+
+    print("Devanagari text has been marked and saved to",fileout)
diff --git a/pwgissues/issue76/transcode/pwg_transcode.py b/pwgissues/issue76/transcode/pwg_transcode.py
@@ -0,0 +1,99 @@
+#-*- coding:utf-8 -*-
+"""pwg_transcode.py
+"""
+from __future__ import print_function
+import sys, re,codecs
+import transcoder
+
+def read_lines(filein):
+    """Return the lines of the UTF-8 file `filein` with line endings removed.
+
+    Only trailing '\\r' and '\\n' characters are stripped; other whitespace
+    is kept.  Prints how many lines were read.
+    """
+    with codecs.open(filein, encoding='utf-8', mode='r') as f:
+        stripped = [raw.rstrip('\r\n') for raw in f]
+    count = len(stripped)
+    print(count, "lines read from", filein)
+    return stripped
+
+def write_lines(fileout, outarr):
+    """Save the strings in `outarr` to the UTF-8 file `fileout`.
+
+    Each string is written followed by a single line feed.  Prints how many
+    lines were written.
+    """
+    with codecs.open(fileout, "w", "utf-8") as f:
+        for text in outarr:
+            f.write(text + '\n')
+    count = len(outarr)
+    print(count, "lines written to", fileout)
+
+def print_unicode(u):
+    """Return one descriptive line per character of the string `u`.
+
+    Each line has the form
+        <code point as 4+ hex digits> | <character> | <Unicode name>
+    Characters without a name in the Unicode database (control characters
+    such as TAB, unassigned code points) are listed with the placeholder
+    name '<unnamed>' instead of raising ValueError.
+    Despite its name the function prints nothing; it returns the list of
+    lines (empty for an empty string).
+
+    Sample output:
+x= a/MSa—BU/
+0905 | अ | DEVANAGARI LETTER A
+0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
+0902 | ं | DEVANAGARI SIGN ANUSVARA
+0936 | श | DEVANAGARI LETTER SHA
+2014 | — | EM DASH
+092D | भ | DEVANAGARI LETTER BHA
+0942 | ू | DEVANAGARI VOWEL SIGN UU
+0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
+    """
+    import unicodedata
+    # The default argument keeps this reporting helper from raising on a
+    # character that has no Unicode name.
+    return [
+        f"{ord(c):04X} | {c} | {unicodedata.name(c, '<unnamed>')}"
+        for c in u
+    ]
+
+def transcode(x,tranin,tranout):
+    """Return `x` converted from encoding `tranin` to encoding `tranout`.
+
+    Thin wrapper around transcoder.transcoder_processString.
+    """
+    return transcoder.transcoder_processString(x,tranin,tranout)
+
+def convert_line(line,tranin,tranout,outarr):
+    """Transcode the text inside every {#...#} span of `line`.
+
+    The content of each span is converted from `tranin` to `tranout` and
+    wrapped in {#...#} again; text outside the spans is left as is.
+    Every converted span is also converted back as an invertibility check.
+    When the round trip does not reproduce the original text, diagnostic
+    lines are appended to `outarr`: the original and converted strings, a
+    per-character Unicode listing of the original, then the round-tripped
+    string with its re-conversion and its own Unicode listing.
+
+    Returns the converted line.
+    """
+    def replace_span(match):
+        original = match.group(1)
+        converted = transcode(original, tranin, tranout)
+        # Invertibility check: converting back must give the original text.
+        roundtrip = transcode(converted, tranout, tranin)
+        if roundtrip != original:
+            reconverted = transcode(roundtrip, tranin, tranout)
+            outarr.append(' x=%s,  y=%s' %(original, converted))
+            outarr.extend(print_unicode(original))
+            outarr.append('x1=%s, y1=%s' %(roundtrip, reconverted))
+            outarr.extend(print_unicode(roundtrip))
+        return '{#%s#}' % converted
+
+    return re.sub('{#(.*?)#}', replace_span, line)
+
+def convert_lines(lines,tranin,tranout,outarr):
+    """Return a new list with convert_line applied to each line of `lines`.
+
+    `tranin`, `tranout` and `outarr` are passed unchanged to convert_line
+    for every line.
+    """
+    return [convert_line(text, tranin, tranout, outarr) for text in lines]
+
+def test():
+    """Print the Devanagari transcoding of a sample slp1 string, then exit.
+
+    Manual smoke test: the process is terminated with exit status 1 after
+    printing, so nothing after the call runs.
+    NOTE(review): presumably this must be called after
+    transcoder.transcoder_set_dir has been given the transcoder directory.
+    """
+    slp1 = 'rAARa/'
+    deva = transcode(slp1,'slp1','deva')
+    print('test: {#%s#}' % deva)
+    # sys.exit instead of the bare `exit`: the latter is installed by the
+    # `site` module for interactive sessions and is not always defined.
+    sys.exit(1)
+
+
+if __name__=="__main__":
+    # Usage:
+    #  python pwg_transcode.py <transcoderdir> <tranin> <tranout> <filein> <fileout>
+    transcoderdir = sys.argv[1]  # passed to transcoder.transcoder_set_dir
+    tranin = sys.argv[2]         # encoding of the input, e.g. deva
+    tranout = sys.argv[3]        # encoding of the output, e.g. slp1
+    filein = sys.argv[4]         # path to the text file to transcode
+    fileout = sys.argv[5]        # path of the transcoded file to write
+
+    lines = read_lines(filein)
+    transcoder.transcoder_set_dir(transcoderdir)
+    # test()  # uncomment to run the smoke test instead of the conversion
+
+    # Diagnostics for spans that fail the round-trip check collect here.
+    outarr = []
+    write_lines(fileout, convert_lines(lines,tranin,tranout,outarr))
+    write_lines('temp_debug.txt', outarr)
+