-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
vntxt_0.txt transcoding of vntext_0_deva.txt. #76
- Loading branch information
1 parent
a67a6b7
commit 05b0723
Showing
10 changed files
with
2,948 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
Notes on changes to vntxt_0_deva.txt | ||
These changes were made so that transcoding to slp1 is invertible. | ||
--- | ||
old: | ||
{#अनरुस्#} ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति.#} | ||
new: | ||
{#अनरुस्#} ¦ [1.0176] lies: {#ते एवैतदनरुष्करोति यदक्ष्यावानक्ति#}. | ||
--- | ||
old: | ||
{#ऋजीक#} ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविर्ऋजीक#} ein Wort ist. | ||
new: | ||
{#ऋजीक#} ¦ [1.1042] Das Beispiel aus <ls>ṚV.</ls> ist zu streichen, da {#आविरृजीक#} ein Wort ist. | ||
This is subtle. The unicode characters differ but when rendered look identical: | ||
old unicode: | ||
0906 | आ | DEVANAGARI LETTER AA | ||
0935 | व | DEVANAGARI LETTER VA | ||
093F | ि | DEVANAGARI VOWEL SIGN I | ||
0930 | र | DEVANAGARI LETTER RA | ||
094D | ् | DEVANAGARI SIGN VIRAMA <<< | ||
090B | ऋ | DEVANAGARI LETTER VOCALIC R <<< | ||
091C | ज | DEVANAGARI LETTER JA | ||
0940 | ी | DEVANAGARI VOWEL SIGN II | ||
0915 | क | DEVANAGARI LETTER KA | ||
new unicode: | ||
0906 | आ | DEVANAGARI LETTER AA | ||
0935 | व | DEVANAGARI LETTER VA | ||
093F | ि | DEVANAGARI VOWEL SIGN I | ||
0930 | र | DEVANAGARI LETTER RA | ||
0943 | ृ | DEVANAGARI VOWEL SIGN VOCALIC R <<< | ||
091C | ज | DEVANAGARI LETTER JA | ||
0940 | ी | DEVANAGARI VOWEL SIGN II | ||
0915 | क | DEVANAGARI LETTER KA | ||
--- | ||
old: | ||
{#जंह्॒#} und {#जंहस्#} ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls> | ||
new: | ||
{#जंह्#} und {#जंहस्#} ¦ [3.0002] Ueber eine andere Auffassung s. <ls>BENFEY</ls> in <ls>Gött. gel. Anz. 1860. Stück 74. 75. S. 742. fgg.</ls> | ||
old unicode: | ||
091C | ज | DEVANAGARI LETTER JA | ||
0902 | ं | DEVANAGARI SIGN ANUSVARA | ||
0939 | ह | DEVANAGARI LETTER HA | ||
094D | ् | DEVANAGARI SIGN VIRAMA | ||
0952 | ॒ | DEVANAGARI STRESS SIGN ANUDATTA <<< to remove | ||
new unicode: | ||
091C | ज | DEVANAGARI LETTER JA | ||
0902 | ं | DEVANAGARI SIGN ANUSVARA | ||
0939 | ह | DEVANAGARI LETTER HA | ||
094D | ् | DEVANAGARI SIGN VIRAMA | ||
--- | ||
old: | ||
{#तरुदूालका#} ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}. | ||
new: typo in 1st word | ||
{#तरुदूलिका#} ¦ [3.0271] nach dem {#तरुत्र#} lies: {#तरुदूलिका#}. | ||
--- | ||
old: | ||
{#रााण꣫#} ¦ [6.0317] (auf Bogen 21*) ... | ||
new: PWG style udAtta -> MW style udAtta, also hiatus | ||
; the cdsl headword spelling is rARa/ = राण॑ | ||
{#राण॑#} ¦ [6.0317] (auf Bogen 21*) | ||
--- | ||
old: | ||
{#राण॑#} ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#ि#} abgebrochen. | ||
new: Replace DEVANAGARI VOWEL SIGN I with DEVANAGARI LETTER I | ||
{#राण॑#} ¦ [6.0317] (auf Bogen 21*) Z. 1; in {#राणि#} und {#पैलादि#} ist der Haken über dem {#इ#} abgebrochen. | ||
; Jim doesn't know how to represent in slp1 the 'naked' vowel sign. | ||
; the hook above the {#ि#} is broken |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
09-25-2024 | ||
issue: https://github.com/sanskrit-lexicon/PWG/issues/76 | ||
|
||
# This directory | ||
cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76 | ||
|
||
# starting point for vntxt digitization of missing VN material. | ||
https://github.com/user-attachments/files/17129810/PWGVN_1-6_reformatted_.dng.txt | ||
|
||
# local copy rename | ||
mv PWGVN_1-6_reformatted_.dng.txt vntxt_0_deva_orig.txt | ||
cp vntxt_0_deva_orig.txt vntxt_0_deva.txt | ||
|
||
|
||
# transcode | ||
mkdir transcode | ||
cd transcode | ||
python mark_deva.py ../vntxt_0_deva.txt vntxt_0_deva_marked.txt | ||
637 lines read from ../vntxt_0_deva.txt | ||
637 lines written to vntxt_0_deva_marked.txt | ||
Devanagari text has been marked and saved to vntxt_0_deva_marked.txt | ||
|
||
Note: This step is unnecessary!! AB has already marked Devanagari as {#X#}, | ||
which is the pwg convention. | ||
rm vntxt_0_deva_marked.txt | ||
|
||
# transcode | ||
cd /c/xampp/htdocs/sanskrit-lexicon/PWG/pwgissues/issue76/transcode | ||
mkdir pwgtranscoder1 | ||
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/deva_slp1.xml pwgtranscoder1/deva_slp1.xml | ||
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder1/slp1_deva.xml pwgtranscoder1/slp1_deva.xml | ||
|
||
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/transcoder.py . | ||
cp /c/xampp/htdocs/sanskrit-lexicon/MWS/mwtranscode/mw_transcode.py pwg_transcode.py | ||
|
||
# heavily edit pwg_transcode.py | ||
|
||
----------------- | ||
Transcode | ||
# Some editing of vntxt_0_deva.txt related to transcoding to get invertibility | ||
See change_vntxt_0_deva.txt | ||
|
||
python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt | ||
# check invertibility | ||
python pwg_transcode.py pwgtranscoder1 slp1 deva ../vntxt_0.txt tempchk.txt | ||
diff ../vntxt_0_deva.txt tempchk.txt | wc -l | ||
0 # invertibility checks. | ||
|
||
python pwg_transcode.py pwgtranscoder1 deva slp1 ../vntxt_0_deva.txt ../vntxt_0.txt | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# coding=utf-8 | ||
""" | ||
mark_deva.py | ||
Code assistance from Copilot | ||
""" | ||
import re, sys, codecs | ||
|
||
def read_lines(filein):
    """Read a UTF-8 text file and return its lines without line endings.

    The built-in ``open`` is used instead of ``codecs.open`` so that lines
    are split only on ``\\n``, ``\\r`` and ``\\r\\n``.  ``codecs.open``
    additionally breaks lines on Unicode line boundaries (form feed,
    U+0085, U+2028, ...), which would change the line count.

    Args:
        filein: path of the file to read.

    Returns:
        A list with one string per line, trailing ``\\r``/``\\n`` removed.
    """
    with open(filein, encoding='utf-8') as f:
        lines = [x.rstrip('\r\n') for x in f]
    print(len(lines),"lines read from",filein)
    return lines
|
||
def write_lines(fileout,outarr):
    """Write each string of ``outarr`` to ``fileout`` as one UTF-8 line.

    ``newline="\\n"`` disables platform newline translation, so every line
    ends with a single ``\\n`` byte regardless of the operating system
    (the same bytes the former ``codecs.open`` version produced).

    Args:
        fileout: path of the file to create or overwrite.
        outarr: list of strings, without line endings.
    """
    with open(fileout, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(out + '\n' for out in outarr)
    print(len(outarr),"lines written to",fileout)
|
||
# One or more consecutive Devanagari characters.  Besides the main block
# (U+0900-U+097F) the Devanagari Extended block (U+A8E0-U+A8FF) is included,
# so combining signs such as U+A8EB stay inside the marked run instead of
# being left outside the closing tag.  Compiled once, not on every call.
_DEVANAGARI_RUN = re.compile(r'[\u0900-\u097F\uA8E0-\uA8FF]+')


def mark_devanagari(text):
    """Wrap every run of Devanagari characters in ``text`` with ``<s>...</s>``.

    Characters outside the Devanagari blocks (spaces, punctuation, Latin
    text) are left untouched, so words separated by a space are marked
    individually.

    Args:
        text: the string to process.

    Returns:
        A copy of ``text`` with each Devanagari run enclosed in ``<s>`` tags.
    """
    return _DEVANAGARI_RUN.sub(r'<s>\g<0></s>', text)
|
||
def marklines(lines):
    """Return a new list holding ``mark_devanagari`` applied to each line."""
    return [mark_devanagari(text) for text in lines]
|
||
if __name__=="__main__":
    # Usage: python mark_deva.py <filein> <fileout>
    filein = sys.argv[1]
    fileout = sys.argv[2]

    # Read the input, mark the Devanagari text of every line, and write
    # the marked lines to the output file.
    write_lines(fileout,marklines(read_lines(filein)))

    print("Devanagari text has been marked and saved to",fileout)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#-*- coding:utf-8 -*- | ||
"""pwg_transcode.py | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
import transcoder | ||
|
||
def read_lines(filein):
    """Read a UTF-8 text file and return its lines without line endings.

    The built-in ``open`` is used instead of ``codecs.open`` so that lines
    are split only on ``\\n``, ``\\r`` and ``\\r\\n``.  ``codecs.open``
    additionally breaks lines on Unicode line boundaries (form feed,
    U+0085, U+2028, ...), which would change the line count and so
    disturb a line-by-line comparison of the output with the input.

    Args:
        filein: path of the file to read.

    Returns:
        A list with one string per line, trailing ``\\r``/``\\n`` removed.
    """
    with open(filein, encoding='utf-8') as f:
        lines = [x.rstrip('\r\n') for x in f]
    print(len(lines),"lines read from",filein)
    return lines
|
||
def write_lines(fileout,outarr):
    """Write each string of ``outarr`` to ``fileout`` as one UTF-8 line.

    ``newline="\\n"`` disables platform newline translation, so every line
    ends with a single ``\\n`` byte regardless of the operating system
    (the same bytes the former ``codecs.open`` version produced).

    Args:
        fileout: path of the file to create or overwrite.
        outarr: list of strings, without line endings.
    """
    with open(fileout, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(out + '\n' for out in outarr)
    print(len(outarr),"lines written to",fileout)
|
||
def print_unicode(u):
    """Describe each character of ``u`` as 'HEX | char | UNICODE NAME'.

    Despite its name this function prints nothing; it returns the
    description lines so the caller can store or write them.

    Characters without a Unicode name (for example control characters
    such as TAB) are reported with the placeholder ``<unnamed>`` instead
    of raising ``ValueError``.

    Sample output:
     x= a/MSa—BU/
    0905 | अ | DEVANAGARI LETTER A
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    0902 | ं | DEVANAGARI SIGN ANUSVARA
    0936 | श | DEVANAGARI LETTER SHA
    2014 | — | EM DASH
    092D | भ | DEVANAGARI LETTER BHA
    0942 | ू | DEVANAGARI VOWEL SIGN UU
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA

    Args:
        u: the string to describe.

    Returns:
        A list with one formatted string per character of ``u``.
    """
    import unicodedata
    outarr = []
    for c in u:
        # The second argument is returned when the character has no name.
        name = unicodedata.name(c, '<unnamed>')
        outarr.append(f"{ord(c):04X} | {c} | {name}")
    return outarr
|
||
def transcode(x,tranin,tranout):
    """Convert string ``x`` from encoding ``tranin`` to encoding ``tranout``.

    Thin wrapper that delegates to ``transcoder.transcoder_processString``.
    """
    return transcoder.transcoder_processString(x,tranin,tranout)
|
||
def convert_line(line,tranin,tranout,outarr):
    """Transcode the text of every ``{#X#}`` span in ``line``.

    Each span ``{#X#}`` becomes ``{#Y#}``, where ``Y`` is ``X`` transcoded
    from ``tranin`` to ``tranout``.  Text outside the spans is unchanged.

    Every span is also transcoded back again.  When the round trip does
    not reproduce ``X``, diagnostic lines (the strings involved and a
    per-character Unicode listing) are appended to ``outarr``.

    Args:
        line: one line of the digitization.
        tranin: name of the source encoding.
        tranout: name of the target encoding.
        outarr: list that receives diagnostics; it is modified in place.

    Returns:
        The line with all ``{#...#}`` spans transcoded.
    """
    def transcode_span(match):
        original = match.group(1)
        converted = transcode(original,tranin,tranout)
        # Invertibility check: transcoding back must restore the original.
        restored = transcode(converted,tranout,tranin)
        if restored != original:
            reconverted = transcode(restored,tranin,tranout)
            outarr.append(' x=%s, y=%s' %(original,converted))
            outarr.extend(print_unicode(original))
            outarr.append('x1=%s, y1=%s' %(restored,reconverted))
            outarr.extend(print_unicode(restored))
        return '{#%s#}' % converted

    return re.sub('{#(.*?)#}',transcode_span,line)
|
||
def convert_lines(lines,tranin,tranout,outarr):
    """Apply ``convert_line`` to each line and return the results as a list.

    ``outarr`` is passed through to ``convert_line`` on every call.
    """
    return [convert_line(text,tranin,tranout,outarr) for text in lines]
|
||
def test():
    """Ad-hoc check: transcode a sample slp1 string to Devanagari and stop.

    Prints the transcoded sample and then terminates the program with
    exit status 1.  ``sys.exit`` is used rather than the ``exit`` helper,
    which is installed by the ``site`` module and is not available when
    Python runs with ``-S``.
    """
    slp1 = 'rAARa/'
    deva = transcode(slp1,'slp1','deva')
    print('test: {#%s#}' % deva)
    sys.exit(1)
|
||
|
||
if __name__=="__main__":
    # Usage:
    #   python pwg_transcode.py <transcoderdir> <tranin> <tranout> <filein> <fileout>
    transcoderdir, tranin, tranout = sys.argv[1], sys.argv[2], sys.argv[3]
    filein = sys.argv[4]   # path to the digitization to transcode
    fileout = sys.argv[5]  # path of the transcoded result

    lines = read_lines(filein)
    # Directory holding the transcoder xml files (e.g. deva_slp1.xml).
    transcoder.transcoder_set_dir(transcoderdir)

    # outarr collects the invertibility diagnostics from convert_line.
    outarr = []
    write_lines(fileout,convert_lines(lines,tranin,tranout,outarr))

    # Diagnostics always go to this fixed file in the current directory.
    write_lines('temp_debug.txt',outarr)
|
Oops, something went wrong.