Skip to content

Commit

Permalink
Transcode scripts between slp1 and iast (=roman) and slp1 and deva. #95
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Jul 18, 2023
1 parent e5f6700 commit b52b15b
Show file tree
Hide file tree
Showing 10 changed files with 1,584 additions and 0 deletions.
101 changes: 101 additions & 0 deletions pwkissues/issue95/change_0.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
; **********************************************************
; changes noticed during devanagari conversion
; **********************************************************
; <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500
44142 old <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500
;
44142 new <L>9574<pc>1112-3<k1>ard<k2>ard<e>500
;---------------------------------------------------
; <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500
44143 old {#ard/#}¦ {#fdati#} (<ls>ṚV.</ls>), {#a/rdati#} und {#*fRatti#}
;
44143 new {#ard#}¦ {#fdati#} (<ls>ṚV.</ls>), {#a/rdati#} und {#*fRatti#}
;---------------------------------------------------
; <L>29905<pc>2090-2<k1>kfY<k2>kfY<e>000
144239 old {#kfY#}¦ Bez. {%der Wurzel.%} {#kar^1 , karoti , kurute#} <ls n="Chr.">235,25.</ls>
;
144239 new {#kfY#}¦ Bez. {%der Wurzel.%} 1. {#kar#} , {#karoti , kurute#} <ls n="Chr.">235,25.</ls>
;---------------------------------------------------
; <L>52049<pc>3112-3<k1>dfSAna<k2>df/SAna<e>100
257830 old <div n="1">— 1) <ab>Partic.</ab> von {#daS^#}
;
257830 new <div n="1">— 1) <ab>Partic.</ab> von {#darS#}
;---------------------------------------------------
; <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107
301808 old <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107
;
301808 new <L>60799<pc>3236-3<k1>nIv<k2>*nIv<e>107
;---------------------------------------------------
; <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107
301809 old {#*nIv/#}¦ {#nIvati (sTOlye)#}.
;
301809 new {#*nIv#},¦ {#nIvati (sTOlye)#}.
;---------------------------------------------------
; <L>85535<pc>5059-3<k1>mA<k2>mA<h>5<e>500
426024 old <div n="m">— <ab>Intens.</ab> <ab>Partic.</ab> {#m^myat#} ( {#mo/miat#}) {%blökend%} (vom Bock).
;
426024 new <div n="m">— <ab>Intens.</ab> <ab>Partic.</ab> {#memyat#} ( {#mo/miat#}) {%blökend%} (vom Bock).
;---------------------------------------------------
; <L>86749<pc>5076-3<k1>mi<k2>mi<h>2<e>500
432160 old <div n="p">— Mit {#parA#} {%zurückkehren%} im <ab>Partic.</ab> Fut. {#parA-m^zyan#} <ls>AIT. BR.</ls>
;
432160 new <div n="p">— Mit {#parA#} {%zurückkehren%} im <ab>Partic.</ab> Fut. {#parA-mezyan#} <ls>AIT. BR.</ls>
;---------------------------------------------------
; <L>97406<pc>6006-1<k1>vaj<k2>*vaj<e>500
487480 old {#*vaj#}¦ , {#vajati#} ( {#gatO#}). {#vajayanti#} <ls>MBH. 2,1142</ls> fehlerhaft für {#var/jayanti#}. {#vAjay#} <ab>s.</ab> bes.
;
487480 new {#*vaj#}¦ , {#vajati#} ( {#gatO#}). {#vajayanti#} <ls>MBH. 2,1142</ls> fehlerhaft für {#varjayanti#}. {#vAjay#} <ab>s.</ab> bes.
;---------------------------------------------------
; <L>99664<pc>6048-3<k1>vasyasa<k2>vasyasa<e>000
500583 old {#vasyasa#}¦ in {#pApa°#} und {#Svovasyas/#}.
;
500583 new {#vasyasa#}¦ in {#pApa°#} und {#Svovasyasa/#}.
;---------------------------------------------------
; <L>105645<pc>6139-1<k1>visarman<k2>visarma/n<e>100
531294 old {#visarma/n#}¦ <lex>m.</lex> {%das Zerrinnen.%} {#visarmA/RaM kar/#} {%Etwas%} (<ab>Acc.</ab>) {%zerrinnen lassen.%}
;
531294 new {#visarma/n#}¦ <lex>m.</lex> {%das Zerrinnen.%} {#visarmA/RaM kar#} {%Etwas%} (<ab>Acc.</ab>) {%zerrinnen lassen.%}
;---------------------------------------------------
; <L>129442<pc>7187-1<k1>sotva<k2>(sotva)<e>100
649155 old {#(sotva)#}¦ {#sp/tua#} <lex>Adj.</lex> {%zu keltern.%}
;
649155 new {#(sotva)#}¦ {#so/tua#} <lex>Adj.</lex> {%zu keltern.%}
;---------------------------------------------------
; **********************************************************
; changes noticed during iast (roman) conversion
; **********************************************************
; <L>155<pc>1002-3<k1>akiMcanatA<k2>akiMcanatA<e>100
716 old {#akiMcanatA#}¦ <lex>f.</lex> und {#aki^canatva#} <lex>n.</lex> {%Besitzlosigkeit , Armuth.%}
;
716 new {#akiMcanatA#}¦ <lex>f.</lex> und {#akiMcanatva#} <lex>n.</lex> {%Besitzlosigkeit , Armuth.%}
;---------------------------------------------------
; <L>22598<pc>1276-2<k1>o<k2>o/<h>3<e>000
107183 old 3. {#o/#}¦ <ls n="Chr.">6,7.</ls> <ls n="Chr.">18,22</ls> = ^2. {#A/^2+u#}
;
107183 new 3. {#o/#}¦ <ls n="Chr.">6,7.</ls> <ls n="Chr.">18,22</ls> = ^2. {#A/#} + ^2. {#u#}
;---------------------------------------------------
; <L>55397<pc>3160-3<k1>Diz<k2>*Diz<h>1<e>000
274662 old 1. {#*Diz#}¦ = {#DA^1 , diDezwi#} ( {#Sabde#}).
;
274662 new 1. {#*Diz#}¦ = 1. {#DA#}. {#diDezwi#} ({#Sabde#}).
;---------------------------------------------------
; <L>83663<pc>5034-1<k1>marj<k2>marj<h>1<e>500
416714 old <div n="1">— 2) {%Etwas abwaschen%} , so <ab>v.a.</ab> {%rächen.%} {#0ratimArjita#}.
;
416714 new <div n="1">— 2) {%Etwas abwaschen%} , so <ab>v.a.</ab> {%rächen.%} {#pratimArjita#}.
;---------------------------------------------------
; <L>89765<pc>5120-3<k1>yaTA<k2>ya/TA<e>108
447320 old <div n="1">— 1) {%wie , gleichwie%} (einem {#ta/TA , taTA taTA#} [106 , 33] , {#eva/ , evam#} oder {#tadvat#} entsprechend). {#ya/TA vA#} (nach einem vorangehenden {#vA#} ) {%oder wie sonst.%} {#ya/TA cit , ya/TA ha , ya/TA ha vE (33 , 7) , yaTA - iva , iva yaTA , ya/TevANga/#} (<ls>ṚV. 10,86,7</ls>), {#ya/TEva ha#} (<ls>Chr. 36,22</ls>). {%*Auch als Ausruf der Verwunderung.%} {#yaTo etat#} {%was das betrifft (dass)%}. {#yaTA - taTA#} {%oder%} {#yaTA - tEna satyEna#} bei Betheuerungen und festen Behauptungen {%so gewiss - so wahr%} ; ausnahmsweise wechseln in diesem Falle {#yaTA#} und {#taTA#} ihre Stellen.
;
447320 new <div n="1">— 1) {%wie , gleichwie%} (einem {#ta/TA , taTA taTA#} [106 , 33] , {#eva/ , evam#} oder {#tadvat#} entsprechend). {#ya/TA vA#} (nach einem vorangehenden {#vA#} ) {%oder wie sonst.%} {#ya/TA cit , ya/TA ha , ya/TA ha vE#} (33 , 7) , {#yaTA - iva , iva yaTA , ya/TevANga/#} (<ls>ṚV. 10,86,7</ls>), {#ya/TEva ha#} (<ls>Chr. 36,22</ls>). {%*Auch als Ausruf der Verwunderung.%} {#yaTo etat#} {%was das betrifft (dass)%}. {#yaTA - taTA#} {%oder%} {#yaTA - tEna satyEna#} bei Betheuerungen und festen Behauptungen {%so gewiss - so wahr%} ; ausnahmsweise wechseln in diesem Falle {#yaTA#} und {#taTA#} ihre Stellen.
;---------------------------------------------------
; <L>104498<pc>6122-3<k1>viS<k2>vi/S<h>2<e>100
525581 old <div n="1">— 2) <ab>Sg.</ab> und <ab>Pl.</ab> {%Gemeinde%} (zunächst {%die kleinere Vereinigung innerhalb des Volkes) ; Stamm , Volk%} ; <ab>Pl.</ab> {%die Unterthanen , Leute , Mannschaft%} , {#viSAM patiH#} , {#viSAM nATaH#} , {#viSamISvara:#} und {#viSAM varizWaH#} so <ab>v.a.</ab> {%Fürst , König.%}
;
525581 new <div n="1">— 2) <ab>Sg.</ab> und <ab>Pl.</ab> {%Gemeinde%} (zunächst {%die kleinere Vereinigung innerhalb des Volkes) ; Stamm , Volk%} ; <ab>Pl.</ab> {%die Unterthanen , Leute , Mannschaft%} , {#viSAM patiH#} , {#viSAM nATaH#} , {#viSamISvaraH#} und {#viSAM varizWaH#} so <ab>v.a.</ab> {%Fürst , König.%}
;---------------------------------------------------
; <L>122811<pc>7098-2<k1>sah<k2>sah<h>1<e>500
616619 old <div n="1">— 2) {%vermögen , im Stande sein%} (sowohl physisch als auch moralisch) ; {%sich bewogen fühlen , wollen%} ; die Ergänzung ein <ab>Infin.</ab> (<ls>DIVYĀVAD. 93,13. 17. 125,19. 293,26</ls>), ein <ab>Acc.</ab> {%({#svArTam#} so <ab>v.a.</ab> seine Sache zu betreiben vermögen)%} , {#°arTam#} ({#sekArtham#} {%zu begiessen%}), <ab>Acc.</ab> mit {#prati#} <ls>ŚIŚ. 14,83</ls>), <ab>Loc.</ab> oder <ab>Dat.</ab> Mit {#na#} und <ab>Infin.</ab> so <ab>v.a.</ab> {%sich nicht entschliessen können zu%} <ls>DIVYĀVAD. 502,2.</ls>
;
616619 new <div n="1">— 2) {%vermögen , im Stande sein%} (sowohl physisch als auch moralisch) ; {%sich bewogen fühlen , wollen%} ; die Ergänzung ein <ab>Infin.</ab> (<ls>DIVYĀVAD. 93,13. 17. 125,19. 293,26</ls>), ein <ab>Acc.</ab> {%({#svArTam#} so <ab>v.a.</ab> seine Sache zu betreiben vermögen)%} , {#°arTam#} ({#sekArTam#} {%zu begiessen%}), <ab>Acc.</ab> mit {#prati#} <ls>ŚIŚ. 14,83</ls>), <ab>Loc.</ab> oder <ab>Dat.</ab> Mit {#na#} und <ab>Infin.</ab> so <ab>v.a.</ab> {%sich nicht entschliessen können zu%} <ls>DIVYĀVAD. 502,2.</ls>
;---------------------------------------------------
71 changes: 71 additions & 0 deletions pwkissues/issue95/diff_to_changes_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# coding=utf-8
""" diff_to_changes_dict.py
Generate change transactions from an 'old' file and a 'new' file.
The two files must have the same number of lines.
ASSUME each input file is a dictionary as in csl-orig/v02, e.g. mw.txt.
This structure is used to identify the metaline for each change;
this is the only difference from diff_to_changes.py,
which ignores this structure and can therefore be used to
generate changes for any two text files with the same number of lines.
python diff_to_changes_dict.py old.txt new.txt changes.txt
Now:
python updateByLine.py old.txt changes.txt new1.txt
then new1.txt is same as new.txt.
"""
from __future__ import print_function
import sys, re,codecs

def read_lines(filein):
    """Return the lines of the utf-8 text file *filein* as a list of strings.

    Trailing carriage-return and line-feed characters are removed from
    every line.
    """
    collected = []
    with codecs.open(filein, mode='r', encoding='utf-8') as handle:
        for raw in handle:
            collected.append(raw.rstrip('\r\n'))
    return collected

class Change(object):
    """One changed line, rendered as a change transaction.

    Attributes:
      iline     -- index of the line as passed by the caller
      lnum      -- iline + 1 (the number printed in the transaction)
      line1     -- text of the line in the old file
      line2     -- text of the line in the new file
      metaline1 -- metaline text supplied by the caller
      changeout -- list of the five output lines of the transaction
    """
    def __init__(self, iline, line1, line2, metaline1):
        self.iline = iline
        self.lnum = iline + 1
        self.line1 = line1
        self.line2 = line2
        self.metaline1 = metaline1
        # Comment line, old text, separator comment, new text, ruler.
        self.changeout = [
            '; %s' % metaline1,
            '%s old %s' % (self.lnum, self.line1),
            ';',
            '%s new %s' % (self.lnum, self.line2),
            ';---------------------------------------------------',
        ]

def write_changes(fileout, changes):
    """Write all change transactions to the utf-8 file *fileout*.

    Each element of *changes* supplies its output lines in a ``changeout``
    attribute; every line is written followed by a newline.  The number of
    changes is printed afterwards.
    """
    # Gather all output lines before the file is opened.
    outlines = []
    for change in changes:
        outlines.extend(change.changeout)
    with codecs.open(fileout, "w", "utf-8") as handle:
        handle.write(''.join(text + '\n' for text in outlines))
    print(len(changes), "changes written to", fileout)

if __name__=="__main__":
    # Usage: python diff_to_changes_dict.py old.txt new.txt changes.txt
    if len(sys.argv) < 4:
        print('Usage: python diff_to_changes_dict.py old.txt new.txt changes.txt')
        sys.exit(1)
    filein1 = sys.argv[1] # old.txt
    filein2 = sys.argv[2] # new.txt
    fileout = sys.argv[3] # changes.txt
    lines1 = read_lines(filein1)
    lines2 = read_lines(filein2)
    # Changes are keyed by line number, so the files must align line by line.
    if len(lines1) != len(lines2):
        print('ERROR: files have different number of lines')
        # sys.exit, not the site-provided exit(), which is not always defined
        sys.exit(1)
    changes = []
    # Most recent '<L>' line seen in the old file; stays None for any
    # change that occurs before the first metaline.
    metaline1 = None
    for iline, (line1, line2) in enumerate(zip(lines1, lines2)):
        if line1.startswith('<L>'):
            metaline1 = line1
        if line1 != line2:
            changes.append(Change(iline, line1, line2, metaline1))
    write_changes(fileout, changes)

163 changes: 163 additions & 0 deletions pwkissues/issue95/pwtranscode/pw_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#-*- coding:utf-8 -*-
"""pw_transcode.py
"""
from __future__ import print_function
import sys, re,codecs
import transcoder
transcoder.transcoder_set_dir('transcoder')

slp1chars = {}  # NOTE(review): nothing in the code visible here fills this in

# Characters expected in the slp1 text produced by a roman -> slp1 transcoding.
_SLP1_EXPECTED = re.compile(r"^[a-zA-Z|~/\\^— √°'+.,;=?\[\]\(\)!‘’*_3-]*$")


def update_slp1chars(x,y,tranin,tranout):
    """Report transcoded text *y* that contains an unexpected character.

    Only roman -> slp1 transcodings are checked; for any other pair of
    schemes the function returns without doing anything.

    x       -- text before transcoding (only printed in the report)
    y       -- transcoded text, checked against the expected character set
    tranin  -- name of the input scheme
    tranout -- name of the output scheme

    Returns None; the report is printed to standard output.
    """
    if (tranin != 'roman') or (tranout != 'slp1'):
        return
    if _SLP1_EXPECTED.search(y) is not None:
        return
    # The main loop of this script keeps the current 0-based line index in
    # the module-level name 'iline'.  That name does not exist when this
    # function is reached another way (e.g. from test()), so look it up
    # defensively instead of raising NameError.
    current = globals().get('iline')
    lnum = '?' if current is None else current + 1
    print('Unexpected character in line #%s' % (lnum,))
    print(' x=',x)
    print(' y=',y)
    return

def convert(line,tranin,tranout):
    """Return *line* with the text of every '{#X#}' span transcoded.

    Each X is passed to transcode(X, tranin, tranout) and the result is
    put back between '{#' and '#}'.  Text outside such spans is unchanged.
    """
    def transcode_span(match):
        converted = transcode(match.group(1), tranin, tranout)
        return '{#%s#}' % converted

    return re.sub('{#(.*?)#}', transcode_span, line)

def print_unicode(x,u):
    """Print *x*, then one line per character of *u* with its code point,
    the character itself and its Unicode name, then a blank line.

    Characters that have no Unicode name (e.g. control characters such as
    tab) are shown with the name '<unnamed>' instead of raising ValueError.

    Sample output:
    x= a/MSa—BU/
    0905 | अ | DEVANAGARI LETTER A
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    0902 | ं | DEVANAGARI SIGN ANUSVARA
    0936 | श | DEVANAGARI LETTER SHA
    2014 | — | EM DASH
    092D | भ | DEVANAGARI LETTER BHA
    0942 | ू | DEVANAGARI VOWEL SIGN UU
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    """
    import unicodedata
    print('x=',x)
    for c in u:
        # The default keeps this debugging aid usable on unnamed code points.
        name = unicodedata.name(c, '<unnamed>')
        print(f"{ord(c):04X} | {c} | {name}")
    print()

def transcode(x,tranin,tranout):
    """Transcode *x* from scheme *tranin* to scheme *tranout*.

    The conversion is done by transcoder.transcoder_processString; the
    result is also handed to update_slp1chars before it is returned.
    """
    result = transcoder.transcoder_processString(x, tranin, tranout)
    # Debugging aid: switch on to dump the code points of the result for
    # inputs containing '~' (adjust the condition to inspect other inputs,
    # e.g. those containing '|' or 'Q').
    dump_codepoints = False
    if dump_codepoints and ('~' in x):
        print_unicode(x, result)
    update_slp1chars(x, result, tranin, tranout)
    return result

# '<k1>X<k2>Y' part of a metaline.
_K1K2_REGEX = re.compile('<k1>([^<]+)<k2>([^<]+)')

# Transcoding inversion problem for three lines: when converting from
# roman (or roman1) to slp1, a metaline starting with the first item is
# replaced by the second item.
_METALINE_CORRECTIONS = (
    ('<L>116525.7<',
     '<L>116525.7<pc>588,2<k1>paramahaMsopanizadhfdaya<k2>parama/—haMsopanizad-hfdaya<e>4'
    ),
    ('<L>139372<',
     '<L>139372<pc>704,3<k1>prAghAra<k2>prAg—hAra<e>3'
    ),
    ('<L>139373<',
     '<L>139373<pc>704,3<k1>prAghoma<k2>prAg—homa<e>3'
    ),
)


def convert_metaline(line,tranin,tranout):
    """Return metaline *line* with its k1 and k2 values transcoded.

    line    -- a metaline containing '<k1>X<k2>Y'
    tranin  -- name of the input scheme
    tranout -- name of the output scheme

    X and Y are each passed through transcode().  For roman/roman1 -> slp1
    a few known metalines are replaced by hard-coded corrections, and the
    correction is reported on standard output.

    Raises ValueError if *line* has no '<k1>X<k2>Y' part.
    """
    m = _K1K2_REGEX.search(line)
    if m is None:
        # Fail with a message naming the line, not an AttributeError on None.
        raise ValueError('convert_metaline: no <k1>X<k2>Y in metaline: %s' % line)
    x = m.group(0) # entire match
    k1 = m.group(1)
    k2 = m.group(2)
    k1a = transcode(k1,tranin,tranout)
    k2a = transcode(k2,tranin,tranout)
    y = '<k1>%s<k2>%s' %(k1a,k2a)
    lineout = line.replace(x,y)
    # Debugging aid: switch on to dump the code points of accented k2 values.
    dump_accents = False
    if dump_accents and (tranin == 'slp1') and (tranout == 'deva'):
        if '/' in k2:
            print_unicode(k2,k2a)
    if (tranin in ['roman','roman1']) and (tranout == 'slp1'):
        for start,correction in _METALINE_CORRECTIONS:
            if lineout.startswith(start):
                lineout = correction
                print('manual correction:',lineout)
                break
    return lineout

def test():
    """Print the roman -> slp1 transcoding of a few sample strings, then
    terminate the program via exit(1)."""
    tranin, tranout = 'roman', 'slp1'
    samples = (
        'ā́',
        'ā-pyā́yana',
        'ā́-bhūti',
    )
    for sample in samples:
        converted = transcode(sample, tranin, tranout)
        print('%s -> %s'%(sample, converted))
    exit(1)
def test1():
    """Round-trip 'A^' through slp1 -> roman -> slp1, writing the roman
    form and the recovered slp1 form as two lines of temp.txt, then
    terminate the program via exit(1)."""
    source = 'A^'
    with codecs.open("temp.txt","w","utf-8") as out:
        as_roman = transcode(source, 'slp1', 'roman')
        out.write(as_roman + '\n')
        recovered = transcode(as_roman, 'roman', 'slp1')
        out.write(recovered + '\n')
    print('write to temp.txt')
    exit(1)

if __name__=="__main__":
    # To run the ad hoc checks instead, call test() or test1() here.
    # Usage: python pw_transcode.py tranin tranout filein fileout
    if len(sys.argv) < 5:
        print('Usage: python pw_transcode.py tranin tranout filein fileout')
        sys.exit(1)
    tranin = sys.argv[1]
    tranout = sys.argv[2]
    filein = sys.argv[3] # xxx.txt (path to digitization of xxx
    fileout = sys.argv[4] #
    # Debugging aid: set to an integer to stop once iline exceeds it.
    debug_maxline = None

    with codecs.open(filein,"r","utf-8") as f, \
         codecs.open(fileout,"w","utf-8") as fout:
        inentry = False  # True between a '<L>' metaline and its '<LEND>'
        # NOTE: iline must remain a module-level name; update_slp1chars
        # reads it when reporting an unexpected character.
        for iline,line in enumerate(f):
            line = line.rstrip('\r\n')
            # '<LEND>' also starts with '<L>', so it is always tested first.
            is_end = line.startswith('<LEND>')
            is_start = (not is_end) and line.startswith('<L>')
            if inentry:
                if is_end:
                    lineout = line
                    inentry = False
                elif is_start: # error
                    print('Error 1. Not expecting <L>')
                    print("line # ",iline+1)
                    print(line)
                    sys.exit(1)
                else:
                    # body line of the entry: transcode its {#...#} spans
                    lineout = convert(line,tranin,tranout)
            else:
                if is_end: # error
                    print('Error 2. Not expecting <LEND>')
                    print("line # ",iline+1)
                    print(line)
                    sys.exit(1)
                elif is_start:
                    lineout = convert_metaline(line,tranin,tranout)
                    inentry = True
                else:
                    # line outside of <L>...<LEND>
                    lineout = line
            fout.write(lineout+'\n')
            if (debug_maxline is not None) and (iline > debug_maxline):
                print('quit at iline=',iline)
                break
Loading

0 comments on commit b52b15b

Please sign in to comment.