Skip to content

Commit

Permalink
vntxt_0.txt transcoding of vntext_0_deva.txt. #76
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Sep 26, 2024
1 parent a67a6b7 commit 05b0723
Show file tree
Hide file tree
Showing 10 changed files with 2,948 additions and 0 deletions.
66 changes: 66 additions & 0 deletions pwgissues/issue76/change_vntxt_0_deva.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
Notes on changes to vntxt_0_deva.txt
These changes were made so that transcoding to slp1 is invertible.
---
old:
{#अनरुस्#} ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति.#}
new:
{#अनरुस्#} ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति#}.
---
old:
{#ऋजीक#} ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविर्ऋजीक#} ein Wort ist.
new:
{#ऋजीक#} ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविरृजीक#} ein Wort ist.
This is subtle. The unicode characters differ but when rendered look identical:
old unicode:
0906 | आ | DEVANAGARI LETTER AA
0935 | व | DEVANAGARI LETTER VA
093F | ि | DEVANAGARI VOWEL SIGN I
0930 | र | DEVANAGARI LETTER RA
094D | ् | DEVANAGARI SIGN VIRAMA <<<
090B | ऋ | DEVANAGARI LETTER VOCALIC R <<<
091C | ज | DEVANAGARI LETTER JA
0940 | ी | DEVANAGARI VOWEL SIGN II
0915 | क | DEVANAGARI LETTER KA
new unicode:
0906 | आ | DEVANAGARI LETTER AA
0935 | व | DEVANAGARI LETTER VA
093F | ि | DEVANAGARI VOWEL SIGN I
0930 | र | DEVANAGARI LETTER RA
0943 | ृ | DEVANAGARI VOWEL SIGN VOCALIC R <<<
091C | ज | DEVANAGARI LETTER JA
0940 | ी | DEVANAGARI VOWEL SIGN II
0915 | क | DEVANAGARI LETTER KA
---
old:
{#जंह्॒#} und {#जंहस्#} ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls>
new:
{#जंह्#} und {#जंहस्#} ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls>
old unicode:
091C | ज | DEVANAGARI LETTER JA
0902 | ं | DEVANAGARI SIGN ANUSVARA
0939 | ह | DEVANAGARI LETTER HA
094D | ् | DEVANAGARI SIGN VIRAMA
0952 | ॒ | DEVANAGARI STRESS SIGN ANUDATTA <<< to remove
new unicode:
091C | ज | DEVANAGARI LETTER JA
0902 | ं | DEVANAGARI SIGN ANUSVARA
0939 | ह | DEVANAGARI LETTER HA
094D | ् | DEVANAGARI SIGN VIRAMA
---
old:
{#तरुदूालका#} ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}.
new: typo in 1st word
{#तरुदूलिका#} ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}.
---
old:
{#रााण꣫#} ¦ [6.0317] (auf Bogen 21*) ...
new: PWG style udAtta -> MW style udAtta, also hiatus
; the cdsl headword spelling is rARa/ = राण॑
{#राण॑#} ¦ [6.0317] (auf Bogen 21*)
---
old:
{#राण॑#} ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#ि#} abgebrochen.
new: Replace DEVANAGARI VOWEL SIGN I with DEVANAGARI LETTER I
{#राण॑#} ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#इ#} abgebrochen.
; Jim doesn't know how to represent in slp1 the 'naked' vowel sign.
; the hook above the {#ि#} is broken
51 changes: 51 additions & 0 deletions pwgissues/issue76/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
09-25-2024
issue: https://github.com/sanskrit-lexicon/PWG/issues/76

# This directory
cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76

# starting point for vntxt digitization of missing VN material.
https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt

# local copy rename
mv PWGVN_1-6_reformatted_.dng.txt vntxt_0_deva_orig.txt
cp vntxt_0_deva_orig.txt vntxt_0_deva.txt


# transcode
mkdir transcode
cd transcode
python mark_deva.py ../vntxt_0_deva.txt vntxt_0_deva_marked.txt
637 lines read from ../vntxt_0_deva.txt
637 lines written to vntxt_0_deva_marked.txt
Devanagari text has been marked and saved to vntxt_0_deva_marked.txt

Note: This step is unnecessary! AB has already marked Devanagari as {#X#},
which is the pwg convention.
rm vntxt_0_deva_marked.txt

# transcode
cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76/transcode
mkdir pwgtranscoder1
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/deva_slp1.xml pwgtranscoder1/deva_slp1.xml
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/slp1_deva.xml pwgtranscoder1/slp1_deva.xml

cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder.py .
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/mw_transcode.py pwg_transcode.py

# heavily edit pwg_transcode.py

-----------------
Transcode
# Some editing of vntxt_0_deva.txt related to transcoding to get invertibility
See change_vntxt_0_deva.txt

python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt
# check invertibility
python pwg_transcode.py pwgtranscoder1 slp1 deva ../vntxt_0.txt tempchk.txt
diff ../vntxt_0_deva.txt tempchk.txt | wc -l
0 # invertibility checks.

python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt


48 changes: 48 additions & 0 deletions pwgissues/issue76/transcode/mark_deva.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# coding=utf-8
"""
mark_deva.py
Code assistance from Copilot
"""
import re, sys, codecs

def read_lines(filein):
    """Read the UTF-8 text file ``filein`` and return its lines as a list.

    Trailing carriage-return and newline characters are removed from every
    line.  The number of lines read is reported on stdout.
    """
    lines = []
    with codecs.open(filein, encoding='utf-8', mode='r') as handle:
        for raw in handle:
            lines.append(raw.rstrip('\r\n'))
    print(len(lines),"lines read from",filein)
    return lines

def write_lines(fileout,outarr):
    """Write the strings of ``outarr`` to ``fileout`` as UTF-8 text.

    Each string is followed by a single newline character.  The number of
    lines written is reported on stdout.
    """
    with codecs.open(fileout, "w", "utf-8") as handle:
        handle.writelines(text + '\n' for text in outarr)
    print(len(outarr),"lines written to",fileout)

# A Devanagari run starts with a character from the core Devanagari block
# (U+0900-U+097F).  It may then continue with further core characters or with
# characters from the Devanagari Extended (U+A8E0-U+A8FF) and Vedic Extensions
# (U+1CD0-U+1CFF) blocks, so that an accent mark taken from those blocks stays
# inside the same marked span as the word it belongs to.
_DEVANAGARI_RUN = re.compile(
    r'[\u0900-\u097F][\u0900-\u097F\uA8E0-\uA8FF\u1CD0-\u1CFF]*')


def mark_devanagari(text):
    """Wrap every run of Devanagari characters in ``text`` in ``<s>...</s>``.

    A run is a maximal sequence that begins with a character of the core
    Devanagari block and continues with core, Devanagari Extended or Vedic
    Extensions characters.  Anything else (Latin text, spaces, punctuation
    outside those blocks) ends the run and is copied through unchanged.  A
    Devanagari Extended / Vedic Extensions character that does not follow a
    core Devanagari character is left unmarked.

    Returns the marked copy of ``text``; ``text`` itself is not modified.
    """
    return _DEVANAGARI_RUN.sub(lambda m: f'<s>{m.group()}</s>', text)

def marklines(lines):
    """Return a new list holding ``mark_devanagari(line)`` for each line.

    The input sequence is not modified and the order of lines is preserved.
    """
    return [mark_devanagari(line) for line in lines]

if __name__=="__main__":
    # Command line: python mark_deva.py <input file> <output file>
    filein, fileout = sys.argv[1], sys.argv[2]

    # Read the input, mark the Devanagari text in each line, write the result.
    write_lines(fileout, marklines(read_lines(filein)))

    print("Devanagari text has been marked and saved to",fileout)
99 changes: 99 additions & 0 deletions pwgissues/issue76/transcode/pwg_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#-*- coding:utf-8 -*-
"""pwg_transcode.py
"""
from __future__ import print_function
import sys, re,codecs
import transcoder

def read_lines(filein):
    """Return the lines of the UTF-8 file ``filein`` without line endings.

    Every line has its trailing '\\r' and '\\n' characters stripped.  A count
    of the lines read is printed to stdout.
    """
    lines = []
    with codecs.open(filein, encoding='utf-8', mode='r') as handle:
        for raw in handle:
            lines.append(raw.rstrip('\r\n'))
    print(len(lines),"lines read from",filein)
    return lines

def write_lines(fileout,outarr):
    """Write each string of ``outarr`` to ``fileout`` (UTF-8), one per line.

    A single newline character is appended to every string.  A count of the
    lines written is printed to stdout.
    """
    with codecs.open(fileout, "w", "utf-8") as handle:
        handle.writelines(text + '\n' for text in outarr)
    print(len(outarr),"lines written to",fileout)

def print_unicode(u):
    """Describe every character of the string ``u``.

    Returns a list with one entry per character, formatted as
    ``"<code point, 4+ hex digits> | <character> | <Unicode name>"``.
    Characters that have no Unicode name (for example control characters
    such as TAB) are reported with the placeholder name ``<unnamed>`` instead
    of raising ``ValueError``, so that a debug dump never aborts the run.

    Sample output:
    x= a/MSa—BU/
    0905 | अ | DEVANAGARI LETTER A
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    0902 | ं | DEVANAGARI SIGN ANUSVARA
    0936 | श | DEVANAGARI LETTER SHA
    2014 | — | EM DASH
    092D | भ | DEVANAGARI LETTER BHA
    0942 | ू | DEVANAGARI VOWEL SIGN UU
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    """
    import unicodedata
    outarr = []
    for c in u:
        # Without a default, unicodedata.name raises ValueError for
        # characters that have no name assigned.
        name = unicodedata.name(c, '<unnamed>')
        outarr.append(f"{ord(c):04X} | {c} | {name}")
    return outarr

def transcode(x,tranin,tranout):
    """Convert the string ``x`` from scheme ``tranin`` to scheme ``tranout``.

    Thin wrapper around ``transcoder.transcoder_processString``; the scheme
    names are passed through unchanged (this file uses 'slp1' and 'deva').
    """
    return transcoder.transcoder_processString(x,tranin,tranout)

def convert_line(line,tranin,tranout,outarr):
    """Transcode the text of every ``{#X#}`` span in ``line``.

    Each span ``{#X#}`` is replaced by ``{#Y#}`` where ``Y`` is ``X``
    transcoded from ``tranin`` to ``tranout``; text outside the spans is
    left as it is.

    Every span is also transcoded back (``tranout`` -> ``tranin``).  When the
    round trip does not reproduce ``X``, diagnostic lines (the two strings
    involved plus a per-character Unicode listing of each) are appended to
    the caller-supplied list ``outarr``.

    Returns the converted line.
    """
    def replace_span(match):
        source = match.group(1)
        converted = transcode(source,tranin,tranout)
        wrapped = '{#%s#}' % converted
        # Invertibility check: going back must give the original text.
        restored = transcode(converted,tranout,tranin)
        if restored != source:
            reconverted = transcode(restored,tranin,tranout)
            outarr.append(' x=%s, y=%s' %(source,converted))
            outarr.extend(print_unicode(source))
            outarr.append('x1=%s, y1=%s' %(restored,reconverted))
            outarr.extend(print_unicode(restored))
        return wrapped

    return re.sub('{#(.*?)#}',replace_span,line)

def convert_lines(lines,tranin,tranout,outarr):
    """Apply ``convert_line`` to each line and return the results as a list.

    ``tranin``, ``tranout`` and the diagnostics list ``outarr`` are handed
    to ``convert_line`` unchanged for every line.
    """
    return [convert_line(line,tranin,tranout,outarr) for line in lines]

def test():
    """Manual smoke check: print one slp1 string transcoded to Devanagari.

    Transcodes the fixed sample 'rAARa/' from 'slp1' to 'deva', prints it
    wrapped in ``{#...#}`` and then terminates the program with exit
    status 1, so nothing after the call runs.
    """
    slp1 = 'rAARa/'
    deva = transcode(slp1,'slp1','deva')
    print('test: {#%s#}' % deva)
    # sys.exit rather than the builtin exit(): the latter is installed by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(1)


if __name__=="__main__":
    # Command line:
    #   python pwg_transcode.py <transcoderdir> <tranin> <tranout> <filein> <fileout>
    transcoderdir, tranin, tranout, filein, fileout = (
        sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])

    lines = read_lines(filein)
    transcoder.transcoder_set_dir(transcoderdir)

    # outarr collects the diagnostics for spans whose round trip fails.
    outarr = []
    write_lines(fileout, convert_lines(lines,tranin,tranout,outarr))

    # The diagnostics always go to a fixed file name in the current directory.
    write_lines('temp_debug.txt', outarr)

Loading

0 comments on commit 05b0723

Please sign in to comment.