-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Transcode scripts between slp1 and iast (=roman) and slp1 and deva. #95
- Loading branch information
1 parent
e5f6700
commit b52b15b
Showing
10 changed files
with
1,584 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
; ********************************************************** | ||
; changes noticed during devanagari conversion | ||
; ********************************************************** | ||
; <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500 | ||
44142 old <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500 | ||
; | ||
44142 new <L>9574<pc>1112-3<k1>ard<k2>ard<e>500 | ||
;--------------------------------------------------- | ||
; <L>9574<pc>1112-3<k1>ard<k2>ard/<e>500 | ||
44143 old {#ard/#}¦ {#fdati#} (<ls>ṚV.</ls>), {#a/rdati#} und {#*fRatti#} | ||
; | ||
44143 new {#ard#}¦ {#fdati#} (<ls>ṚV.</ls>), {#a/rdati#} und {#*fRatti#} | ||
;--------------------------------------------------- | ||
; <L>29905<pc>2090-2<k1>kfY<k2>kfY<e>000 | ||
144239 old {#kfY#}¦ Bez. {%der Wurzel.%} {#kar^1 , karoti , kurute#} <ls n="Chr.">235,25.</ls> | ||
; | ||
144239 new {#kfY#}¦ Bez. {%der Wurzel.%} 1. {#kar#} , {#karoti , kurute#} <ls n="Chr.">235,25.</ls> | ||
;--------------------------------------------------- | ||
; <L>52049<pc>3112-3<k1>dfSAna<k2>df/SAna<e>100 | ||
257830 old <div n="1">— 1) <ab>Partic.</ab> von {#daS^#} | ||
; | ||
257830 new <div n="1">— 1) <ab>Partic.</ab> von {#darS#} | ||
;--------------------------------------------------- | ||
; <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107 | ||
301808 old <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107 | ||
; | ||
301808 new <L>60799<pc>3236-3<k1>nIv<k2>*nIv<e>107 | ||
;--------------------------------------------------- | ||
; <L>60799<pc>3236-3<k1>nIv<k2>*nIv/<e>107 | ||
301809 old {#*nIv/#}¦ {#nIvati (sTOlye)#}. | ||
; | ||
301809 new {#*nIv#},¦ {#nIvati (sTOlye)#}. | ||
;--------------------------------------------------- | ||
; <L>85535<pc>5059-3<k1>mA<k2>mA<h>5<e>500 | ||
426024 old <div n="m">— <ab>Intens.</ab> <ab>Partic.</ab> {#m^myat#} ( {#mo/miat#}) {%blökend%} (vom Bock). | ||
; | ||
426024 new <div n="m">— <ab>Intens.</ab> <ab>Partic.</ab> {#memyat#} ( {#mo/miat#}) {%blökend%} (vom Bock). | ||
;--------------------------------------------------- | ||
; <L>86749<pc>5076-3<k1>mi<k2>mi<h>2<e>500 | ||
432160 old <div n="p">— Mit {#parA#} {%zurückkehren%} im <ab>Partic.</ab> Fut. {#parA-m^zyan#} <ls>AIT. BR.</ls> | ||
; | ||
432160 new <div n="p">— Mit {#parA#} {%zurückkehren%} im <ab>Partic.</ab> Fut. {#parA-mezyan#} <ls>AIT. BR.</ls> | ||
;--------------------------------------------------- | ||
; <L>97406<pc>6006-1<k1>vaj<k2>*vaj<e>500 | ||
487480 old {#*vaj#}¦ , {#vajati#} ( {#gatO#}). {#vajayanti#} <ls>MBH. 2,1142</ls> fehlerhaft für {#var/jayanti#}. {#vAjay#} <ab>s.</ab> bes. | ||
; | ||
487480 new {#*vaj#}¦ , {#vajati#} ( {#gatO#}). {#vajayanti#} <ls>MBH. 2,1142</ls> fehlerhaft für {#varjayanti#}. {#vAjay#} <ab>s.</ab> bes. | ||
;--------------------------------------------------- | ||
; <L>99664<pc>6048-3<k1>vasyasa<k2>vasyasa<e>000 | ||
500583 old {#vasyasa#}¦ in {#pApa°#} und {#Svovasyas/#}. | ||
; | ||
500583 new {#vasyasa#}¦ in {#pApa°#} und {#Svovasyasa/#}. | ||
;--------------------------------------------------- | ||
; <L>105645<pc>6139-1<k1>visarman<k2>visarma/n<e>100 | ||
531294 old {#visarma/n#}¦ <lex>m.</lex> {%das Zerrinnen.%} {#visarmA/RaM kar/#} {%Etwas%} (<ab>Acc.</ab>) {%zerrinnen lassen.%} | ||
; | ||
531294 new {#visarma/n#}¦ <lex>m.</lex> {%das Zerrinnen.%} {#visarmA/RaM kar#} {%Etwas%} (<ab>Acc.</ab>) {%zerrinnen lassen.%} | ||
;--------------------------------------------------- | ||
; <L>129442<pc>7187-1<k1>sotva<k2>(sotva)<e>100 | ||
649155 old {#(sotva)#}¦ {#sp/tua#} <lex>Adj.</lex> {%zu keltern.%} | ||
; | ||
649155 new {#(sotva)#}¦ {#so/tua#} <lex>Adj.</lex> {%zu keltern.%} | ||
;--------------------------------------------------- | ||
; ********************************************************** | ||
; changes noticed during iast (roman) conversion | ||
; ********************************************************** | ||
; <L>155<pc>1002-3<k1>akiMcanatA<k2>akiMcanatA<e>100 | ||
716 old {#akiMcanatA#}¦ <lex>f.</lex> und {#aki^canatva#} <lex>n.</lex> {%Besitzlosigkeit , Armuth.%} | ||
; | ||
716 new {#akiMcanatA#}¦ <lex>f.</lex> und {#akiMcanatva#} <lex>n.</lex> {%Besitzlosigkeit , Armuth.%} | ||
;--------------------------------------------------- | ||
; <L>22598<pc>1276-2<k1>o<k2>o/<h>3<e>000 | ||
107183 old 3. {#o/#}¦ <ls n="Chr.">6,7.</ls> <ls n="Chr.">18,22</ls> = ^2. {#A/^2+u#} | ||
; | ||
107183 new 3. {#o/#}¦ <ls n="Chr.">6,7.</ls> <ls n="Chr.">18,22</ls> = ^2. {#A/#} + ^2. {#u#} | ||
;--------------------------------------------------- | ||
; <L>55397<pc>3160-3<k1>Diz<k2>*Diz<h>1<e>000 | ||
274662 old 1. {#*Diz#}¦ = {#DA^1 , diDezwi#} ( {#Sabde#}). | ||
; | ||
274662 new 1. {#*Diz#}¦ = 1. {#DA#}. {#diDezwi#} ({#Sabde#}). | ||
;--------------------------------------------------- | ||
; <L>83663<pc>5034-1<k1>marj<k2>marj<h>1<e>500 | ||
416714 old <div n="1">— 2) {%Etwas abwaschen%} , so <ab>v.a.</ab> {%rächen.%} {#0ratimArjita#}. | ||
; | ||
416714 new <div n="1">— 2) {%Etwas abwaschen%} , so <ab>v.a.</ab> {%rächen.%} {#pratimArjita#}. | ||
;--------------------------------------------------- | ||
; <L>89765<pc>5120-3<k1>yaTA<k2>ya/TA<e>108 | ||
447320 old <div n="1">— 1) {%wie , gleichwie%} (einem {#ta/TA , taTA taTA#} [106 , 33] , {#eva/ , evam#} oder {#tadvat#} entsprechend). {#ya/TA vA#} (nach einem vorangehenden {#vA#} ) {%oder wie sonst.%} {#ya/TA cit , ya/TA ha , ya/TA ha vE (33 , 7) , yaTA - iva , iva yaTA , ya/TevANga/#} (<ls>ṚV. 10,86,7</ls>), {#ya/TEva ha#} (<ls>Chr. 36,22</ls>). {%*Auch als Ausruf der Verwunderung.%} {#yaTo etat#} {%was das betrifft (dass)%}. {#yaTA - taTA#} {%oder%} {#yaTA - tEna satyEna#} bei Betheuerungen und festen Behauptungen {%so gewiss - so wahr%} ; ausnahmsweise wechseln in diesem Falle {#yaTA#} und {#taTA#} ihre Stellen. | ||
; | ||
447320 new <div n="1">— 1) {%wie , gleichwie%} (einem {#ta/TA , taTA taTA#} [106 , 33] , {#eva/ , evam#} oder {#tadvat#} entsprechend). {#ya/TA vA#} (nach einem vorangehenden {#vA#} ) {%oder wie sonst.%} {#ya/TA cit , ya/TA ha , ya/TA ha vE#} (33 , 7) , {#yaTA - iva , iva yaTA , ya/TevANga/#} (<ls>ṚV. 10,86,7</ls>), {#ya/TEva ha#} (<ls>Chr. 36,22</ls>). {%*Auch als Ausruf der Verwunderung.%} {#yaTo etat#} {%was das betrifft (dass)%}. {#yaTA - taTA#} {%oder%} {#yaTA - tEna satyEna#} bei Betheuerungen und festen Behauptungen {%so gewiss - so wahr%} ; ausnahmsweise wechseln in diesem Falle {#yaTA#} und {#taTA#} ihre Stellen. | ||
;--------------------------------------------------- | ||
; <L>104498<pc>6122-3<k1>viS<k2>vi/S<h>2<e>100 | ||
525581 old <div n="1">— 2) <ab>Sg.</ab> und <ab>Pl.</ab> {%Gemeinde%} (zunächst {%die kleinere Vereinigung innerhalb des Volkes) ; Stamm , Volk%} ; <ab>Pl.</ab> {%die Unterthanen , Leute , Mannschaft%} , {#viSAM patiH#} , {#viSAM nATaH#} , {#viSamISvara:#} und {#viSAM varizWaH#} so <ab>v.a.</ab> {%Fürst , König.%} | ||
; | ||
525581 new <div n="1">— 2) <ab>Sg.</ab> und <ab>Pl.</ab> {%Gemeinde%} (zunächst {%die kleinere Vereinigung innerhalb des Volkes) ; Stamm , Volk%} ; <ab>Pl.</ab> {%die Unterthanen , Leute , Mannschaft%} , {#viSAM patiH#} , {#viSAM nATaH#} , {#viSamISvaraH#} und {#viSAM varizWaH#} so <ab>v.a.</ab> {%Fürst , König.%} | ||
;--------------------------------------------------- | ||
; <L>122811<pc>7098-2<k1>sah<k2>sah<h>1<e>500 | ||
616619 old <div n="1">— 2) {%vermögen , im Stande sein%} (sowohl physisch als auch moralisch) ; {%sich bewogen fühlen , wollen%} ; die Ergänzung ein <ab>Infin.</ab> (<ls>DIVYĀVAD. 93,13. 17. 125,19. 293,26</ls>), ein <ab>Acc.</ab> {%({#svArTam#} so <ab>v.a.</ab> seine Sache zu betreiben vermögen)%} , {#°arTam#} ({#sekArtham#} {%zu begiessen%}), <ab>Acc.</ab> mit {#prati#} <ls>ŚIŚ. 14,83</ls>), <ab>Loc.</ab> oder <ab>Dat.</ab> Mit {#na#} und <ab>Infin.</ab> so <ab>v.a.</ab> {%sich nicht entschliessen können zu%} <ls>DIVYĀVAD. 502,2.</ls> | ||
; | ||
616619 new <div n="1">— 2) {%vermögen , im Stande sein%} (sowohl physisch als auch moralisch) ; {%sich bewogen fühlen , wollen%} ; die Ergänzung ein <ab>Infin.</ab> (<ls>DIVYĀVAD. 93,13. 17. 125,19. 293,26</ls>), ein <ab>Acc.</ab> {%({#svArTam#} so <ab>v.a.</ab> seine Sache zu betreiben vermögen)%} , {#°arTam#} ({#sekArTam#} {%zu begiessen%}), <ab>Acc.</ab> mit {#prati#} <ls>ŚIŚ. 14,83</ls>), <ab>Loc.</ab> oder <ab>Dat.</ab> Mit {#na#} und <ab>Infin.</ab> so <ab>v.a.</ab> {%sich nicht entschliessen können zu%} <ls>DIVYĀVAD. 502,2.</ls> | ||
;--------------------------------------------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# coding=utf-8 | ||
""" diff_to_changes_dict.py | ||
Generate change transactions from an 'old' and 'new' file | ||
The two files should have same number of lines | ||
ASSUME input file is a dictionary as in csl-orig/v02, e.g. mw.txt. | ||
This structure identifies the metaline for each change; | ||
and this is the only difference from diff_to_changes.py, | ||
which ignores this structure, and is thus available for | ||
generating changes for any two text files with same number of lines. | ||
python diff_to_changes_dict.py old.txt new.txt changes.txt | ||
Now: | ||
python updateByLine.py old.txt changes.txt new1.txt | ||
then new1.txt is same as new.txt. | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
|
||
def read_lines(filein):
    """Return the lines of the utf-8 text file `filein`, without line endings.

    The file is read with io.open rather than codecs.open.  Iterating a
    codecs reader splits text with str.splitlines, which also breaks on
    U+2028, U+0085, form feed, etc.; that would shift every later line
    number away from the physical line numbers that the change
    transactions are keyed on.  io.open only treats \\n, \\r and \\r\\n
    as line endings.
    """
    import io
    with io.open(filein,encoding='utf-8',mode='r') as f:
        return [x.rstrip('\r\n') for x in f]
|
||
class Change(object):
    """One changed line, plus its rendering as a change transaction.

    iline     -- 0-based index of the line in the old/new files
    line1     -- text of the line in the old file
    line2     -- text of the line in the new file
    metaline1 -- value shown in the leading '; ...' comment line
    The 1-based line number is kept in `lnum`; the five output lines of
    the transaction are kept in `changeout`.
    """
    def __init__(self,iline,line1,line2,metaline1):
        self.iline = iline
        self.line1 = line1
        self.line2 = line2
        self.metaline1 = metaline1
        # change files use 1-based line numbers
        self.lnum = iline+1
        self.changeout = [
            '; %s' %metaline1,
            '%s old %s' %(self.lnum,self.line1),
            ';',
            '%s new %s' %(self.lnum,self.line2),
            ';---------------------------------------------------',
        ]
|
||
def write_changes(fileout,changes):
    """Write the `changeout` lines of every item of `changes` to `fileout`.

    Output is utf-8, one line per record, in the order given.  A summary
    with the number of changes is printed afterwards.
    """
    # gather all records before the output file is opened
    records = [rec for change in changes for rec in change.changeout]
    with codecs.open(fileout,"w","utf-8") as f:
        f.writelines(rec+'\n' for rec in records)
    print(len(changes),"changes written to",fileout)
|
||
if __name__=="__main__":
    # command line: old.txt new.txt changes.txt
    path_old = sys.argv[1]
    path_new = sys.argv[2]
    fileout = sys.argv[3]
    old_lines = read_lines(path_old)
    new_lines = read_lines(path_new)
    # changes are matched by line number, so the files must line up
    if len(old_lines) != len(new_lines):
        print('ERROR: files have different number of lines')
        exit(1)
    changes = []
    # most recent '<L>' line seen in the old file (None before the first)
    current_meta = None
    for idx,(old,new) in enumerate(zip(old_lines,new_lines)):
        if old.startswith('<L>'):
            current_meta = old
        if old != new:
            changes.append(Change(idx,old,new,current_meta))
    write_changes(fileout,changes)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
#-*- coding:utf-8 -*- | ||
"""pw_transcode.py | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
import transcoder | ||
transcoder.transcoder_set_dir('transcoder') | ||
|
||
slp1chars = {}

# Characters accepted in the result of a roman -> slp1 conversion.
_SLP1_EXPECTED = re.compile(r"^[a-zA-Z|~/\\^— √°'+.,;=?\[\]\(\)!‘’*_3-]*$")


def update_slp1chars(x,y,tranin,tranout):
    """Report a roman -> slp1 result that contains an unexpected character.

    x       -- text before transcoding
    y       -- text after transcoding
    tranin  -- name of the input transcoding
    tranout -- name of the output transcoding
    Nothing is done unless tranin is 'roman' and tranout is 'slp1'.
    Always returns None; the only effect is the printed diagnostic.
    """
    if (tranin != 'roman') or (tranout != 'slp1'):
        return
    if _SLP1_EXPECTED.search(y) is not None:
        return
    # The line index is the module-level `iline` of the main loop.  It does
    # not exist when this is reached via test() or from an import, so look
    # it up defensively instead of raising NameError.
    current = globals().get('iline')
    lnum = '?' if current is None else current+1
    print('Unexpected character in line #%s' % (lnum,))
    print(' x=',x)
    print(' y=',y)
    return
|
||
def convert(line,tranin,tranout):
    """Return `line` with the text of every '{#X#}' span transcoded.

    Only the X inside each '{#X#}' is passed to transcode(); the markers
    and everything outside them are kept as they are.
    """
    def transcode_span(match):
        # rebuild the span around the transcoded inner text
        return '{#%s#}' %transcode(match.group(1),tranin,tranout)

    return re.sub('{#(.*?)#}',transcode_span,line)
|
||
def print_unicode(x,u):
    """Print `x`, then one line per character of `u`, then a blank line.

    Each character line has the form 'XXXX | c | NAME': the code point
    as at least four hex digits, the character itself, and its Unicode
    name.  Characters that have no Unicode name (control characters, for
    instance) are shown as '<unnamed>' instead of raising ValueError.

    Sample output:
    x= a/MSa—BU/
    0905 | अ | DEVANAGARI LETTER A
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    0902 | ं | DEVANAGARI SIGN ANUSVARA
    0936 | श | DEVANAGARI LETTER SHA
    2014 | — | EM DASH
    092D | भ | DEVANAGARI LETTER BHA
    0942 | ू | DEVANAGARI VOWEL SIGN UU
    0951 | ॑ | DEVANAGARI STRESS SIGN UDATTA
    """
    import unicodedata
    print('x=',x)
    for c in u:
        # the default keeps one unnamed character from aborting the dump
        name = unicodedata.name(c,'<unnamed>')
        print(f"{ord(c):04X} | {c} | {name}")
    print()
|
||
def transcode(x,tranin,tranout):
    """Transcode string `x` from `tranin` to `tranout` and return the result.

    The conversion is done by transcoder.transcoder_processString; the
    result is also passed to update_slp1chars for checking.
    """
    debug = False  # set True to dump the code points of results whose input contains '~'
    result = transcoder.transcoder_processString(x,tranin,tranout)
    if debug and ('~' in x):
        print_unicode(x,result)
    update_slp1chars(x,result,tranin,tranout)
    return result
|
||
# Transcoding inversion problem for three lines: metalines whose
# roman -> slp1 result is replaced by a fixed correction.
# Each item is (start of the converted line, full corrected line).
_METALINE_CORRECTIONS = (
    ('<L>116525.7<',
     '<L>116525.7<pc>588,2<k1>paramahaMsopanizadhfdaya<k2>parama/—haMsopanizad-hfdaya<e>4'
    ),
    ('<L>139372<',
     '<L>139372<pc>704,3<k1>prAghAra<k2>prAg—hAra<e>3'
    ),
    ('<L>139373<',
     '<L>139373<pc>704,3<k1>prAghoma<k2>prAg—homa<e>3'
    ),
)


def convert_metaline(line,tranin,tranout):
    """Return metaline `line` with its <k1> and <k2> values transcoded.

    line    -- a metaline containing '<k1>X<k2>Y'
    tranin  -- name of the input transcoding
    tranout -- name of the output transcoding
    Raises ValueError when `line` has no '<k1>X<k2>Y' part (previously
    this surfaced as an AttributeError on None).
    For 'roman'/'roman1' -> 'slp1' a few known lines are replaced by a
    manual correction, which is also printed.
    """
    debug = False  # set True to dump accented slp1 -> deva headwords
    m = re.search('<k1>([^<]+)<k2>([^<]+)',line)
    if m is None:
        raise ValueError('metaline has no <k1>...<k2>... fields: %s' % (line,))
    x = m.group(0) # entire match
    k1 = m.group(1)
    k2 = m.group(2)
    k1a = transcode(k1,tranin,tranout)
    k2a = transcode(k2,tranin,tranout)
    y = '<k1>%s<k2>%s' %(k1a,k2a)
    lineout = line.replace(x,y)
    if debug and (tranin == 'slp1') and (tranout == 'deva') and ('/' in k2):
        print_unicode(k2,k2a)
    if (tranin in ('roman','roman1')) and (tranout == 'slp1'):
        for start,correction in _METALINE_CORRECTIONS:
            if lineout.startswith(start):
                lineout = correction
                print('manual correction:',lineout)
                break
    return lineout
|
||
def test():
    """Print the roman -> slp1 transcoding of a few accented samples, then exit."""
    tranin,tranout = 'roman','slp1'
    samples = ('ā́', 'ā-pyā́yana', 'ā́-bhūti')
    for x in samples:
        print('%s -> %s'%(x,transcode(x,tranin,tranout)))
    exit(1)
def test1():
    """Round-trip 'A^' slp1 -> roman -> slp1, write both results to temp.txt, then exit."""
    source = 'A^'
    with codecs.open("temp.txt","w","utf-8") as f:
        as_roman = transcode(source,'slp1','roman')
        f.write(as_roman+'\n')
        back_to_slp1 = transcode(as_roman,'roman','slp1')
        f.write(back_to_slp1+'\n')
    print('write to temp.txt')
    exit(1)
|
||
if __name__=="__main__":
    # usage: python pw_transcode.py tranin tranout filein fileout
    import io
    #test()
    #test1()
    tranin = sys.argv[1]
    tranout = sys.argv[2]
    filein = sys.argv[3]   # xxx.txt (path to digitization of xxx)
    fileout = sys.argv[4]  # path of the transcoded copy to write

    debug_stop = False  # set True to stop after about 1000 lines

    # io.open is used instead of codecs.open: iterating a codecs reader
    # also breaks lines at U+2028, U+0085, form feed, etc., and each such
    # fragment would then be written out with an extra '\n'.
    # newline='\n' keeps the output line endings as plain '\n'.
    with io.open(filein,"r",encoding="utf-8") as f, \
         io.open(fileout,"w",encoding="utf-8",newline="\n") as fout:
        inentry = False  # True between a '<L>' metaline and its '<LEND>'
        # NOTE: `iline` has to remain a module-level name;
        # update_slp1chars reads it when reporting a problem.
        for iline,line in enumerate(f):
            line = line.rstrip('\r\n')
            starts_entry = line.startswith('<L>')
            ends_entry = line.startswith('<LEND>')
            if inentry and ends_entry:
                lineout = line
                inentry = False
            elif inentry and starts_entry:
                # a new entry began before the previous one was closed
                print('Error 1. Not expecting <L>')
                print("line # ",iline+1)
                print(line)
                sys.exit(1)
            elif inentry:
                # body line of an entry: transcode its {#...#} spans
                lineout = convert(line,tranin,tranout)
            elif starts_entry:
                lineout = convert_metaline(line,tranin,tranout)
                inentry = True
            elif ends_entry:
                # '<LEND>' without a preceding '<L>'
                print('Error 2. Not expecting <LEND>')
                print("line # ",iline+1)
                print(line)
                sys.exit(1)
            else:
                # line outside of <L>...<LEND>
                lineout = line
            fout.write(lineout+'\n')
            if debug_stop and (iline > 1000):
                print('quit at iline=',iline)
                break
Oops, something went wrong.