-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PWG MBH number ls markup enhancement.
Ref: #51
- Loading branch information
1 parent
58f1f8d
commit a09412a
Showing
9 changed files
with
58,478 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
<ls>MBH. Bd. III, S. 818, Z. 5. u. 4,</ls> aTarvaSiras 1646 | ||
<ls n="MBH.">3, Kap. 132-134</ls> azwAvakrIya 7244 | ||
<ls>MBH., Kap. 13-58.</ls> AstIka 9813 | ||
<ls>MBH. I, <is>Adhy</is>. 39</ls> Una 12722 | ||
<ls>MBH. III, <is>Adhy</is>.</ls> Una 12722 | ||
<ls n="MBH.">13, Kap. 102</ls> kUwa 18427 | ||
<ls>MBH. 9, <is>Adhy</is>. 33. fgg.</ls> gadA 21614 | ||
<ls n="MBH.">3, ADHY. 235 - 257.</ls> Goza 24224 | ||
<ls>MBH. 1, <is>Adhy</is>. 188</ls> Cedana 26291 | ||
<ls>MBH. 1-3</ls> trayaScatvAriMSa 30780 | ||
<ls>MBH. 2, <is>Adhy</is>. 45 - 69.</ls> dyUta 35210 | ||
<ls>MBH. 2, <is>Adhy</is>. 1</ls> nirRaya 39440 | ||
<ls n="MBH.">I, S. 418</ls> nirvAsa 39677 | ||
<ls>MBH. 1. 3-9. 12-14</ls> paYcazazwa 41530 | ||
<ls>MBH. 1. 3. 5-8. 12-14</ls> paYcasaptata 41534 | ||
<ls>MBH. 1-8</ls> paYcASa 41592 | ||
<ls>MBH. 9-14</ls> paYcASattama 41598 | ||
<ls>MBH. 1. 3. 5-8. 12-14</ls> paYcASIta 41602 | ||
<ls n="MBH.">1, Kap. 142</ls> parAmarSa 42577 | ||
<ls n="MBH.">5, Adhy. 96 - 104.</ls> mAtalIya 59606 | ||
<ls>MBH. I, S. 656. fgg.</ls> mArkaRqeya 60052 | ||
<ls>MBH. I, S. 407</ls> lipikara 86406 | ||
<ls>MBH. I, S. 308.</ls> vEzamya 96103 | ||
<ls n="MBH.">3, Adhy. 130. fg.</ls> SyenakapotIya 101383 | ||
<ls>MBH. I, S. 231.</ls> zawzazwi 102468 | ||
<ls>MBH. I, S. 245.</ls> zawsaptati 102473 | ||
<ls>MBH. I, S. 254.</ls> zaqaSIti 102499 | ||
<ls>MBH. I, S. 384. II, S. 179.</ls> zazwa 102609 | ||
<ls n="MBH.">3, Kap. 8 fgg.</ls> samAsyA 105729 | ||
<ls>MBH. 1, <is>Adhy</is>. 61</ls> sUtra 112180 | ||
<ls>MBH. 3, <is>Adhy</is>. 222. fgg.</ls> skanda 113426 | ||
<ls>MBH. IV, 432.</ls> svargArohaRa 115286 | ||
<ls>MBH. IV, 433. fgg.</ls> svargArohaRika 115287 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
#-*- coding:utf-8 -*- | ||
"""change_abnormal.py
 Generate change transactions for lines in entries which
 contain <ls> references that are considered abnormal.
 Usage: python change_abnormal.py <lspfx> <filein> <fileout>
"""
from __future__ import print_function | ||
import sys, re,codecs | ||
from parseheadline import parseheadline | ||
|
||
class Entry(object):
    """One entry of the digitization: meta line, data lines and <LEND> line.

    Ldict is shared by every instance; it maps the 'L' value parsed from the
    meta line to the Entry that owns it, so a repeated L can be detected.
    """
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        """Build an entry from its raw lines.

        lines    -- all lines of the entry; lines[0] is the meta line and
                    lines[-1] is the <LEND> line.
        linenum1 -- int, line number of the meta line in the input file.
        linenum2 -- int, line number of the <LEND> line in the input file.

        Prints a message and exits with status 1 if the entry's L value
        has already been registered in Ldict.
        """
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary-like object indexed by field
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit instead of the bare exit(): the latter is injected by
            # the site module for interactive sessions and may be absent.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []  # starts empty; not filled in by this class
|
||
def init_entries(filein):
    """Read filein (utf-8) and return its entries as a list of Entry objects.

    An entry opens at a line starting with '<L>' and closes at the next line
    starting with '<LEND>'.  Lines outside of entries are skipped.  The number
    of lines read and of entries found is printed before returning.

    Exits with status 1 (after printing a diagnostic) when
      1. an '<L>' line appears inside an open entry,
      2. an '<LEND>' line appears outside of any entry,
      3. the file ends while an entry is still open.
    """
    # slurp lines
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    idx1 = None  # index of the '<L>' line of the open entry, None if outside
    for idx, line in enumerate(lines):
        if idx1 is not None:
            # inside an entry: looking for '<LEND>'
            if line.startswith('<LEND>'):
                # Entry receives 1-based line numbers
                recs.append(Entry(lines[idx1:idx + 1], idx1 + 1, idx + 1))
                idx1 = None
            elif line.startswith('<L>'):  # error
                print('init_entries Error 1. Not expecting <L>')
                print("line # ", idx + 1)
                print(line.encode('utf-8'))
                sys.exit(1)
        elif line.startswith('<L>'):
            idx1 = idx
        elif line.startswith('<LEND>'):  # error
            print('init_entries Error 2. Not expecting <LEND>')
            print("line # ", idx + 1)
            print(line.encode('utf-8'))
            sys.exit(1)
    # when all lines are read, no entry should still be open
    if idx1 is not None:
        print('init_entries Error 3. Last entry not closed')
        print('Open entry starts at line', idx1 + 1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs
|
||
|
||
class LSchange(object):
    """A data line of an entry that contains an abnormal <ls> element."""

    def __init__(self, entry, iline, ls):
        # entry: the entry that owns the line
        # iline: index of the line within entry.datalines
        # ls:    text of the abnormal <ls>...</ls> element
        self.entry, self.iline, self.ls = entry, iline, ls
|
||
def _first_abnormal_ls(line, regexdata):
    """Return the first <ls> element of line matching no normal form, else None."""
    for finder, norms in regexdata:
        for ls in finder.findall(line):
            if not any(norm.search(ls) for norm, _ in norms):
                return ls
    return None


def find_abnormals_mbh(lspfx, entries):
    """Return an LSchange for each data line holding an abnormal <ls> reference.

    lspfx   -- literal text of the literary-source prefix (e.g. 'MBH.').
    entries -- iterable of objects with a ``datalines`` list of strings.

    Two markup families are examined on each line: '<ls>PFX ...</ls>' and
    '<ls n="PFX...">...</ls>'.  An element is normal when it matches one of
    the accepted forms of its family (labelled '1a'..'1e' and '2a'..'2f').
    At most one LSchange is produced per line, for the first abnormal element
    found.  The number of abnormal lines is printed before returning.
    """
    # re.escape covers every regex metacharacter of the prefix
    pfx = re.escape(lspfx)

    def compiled(pattern):
        return re.compile(pattern % pfx)

    # each item: (pattern finding candidate elements, [(normal form, label)])
    regexdata = [
        (compiled(r'<ls>%s.*?</ls>'), [
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.]?)</ls>'), '1a'),
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.]?) fgg?[.]</ls>'), '1b'),
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.,]?) v[.] l[.]</ls>'), '1c'),
            (compiled(r'<ls>%s</ls>'), '1d'),
            (compiled(r'<ls>%s[^<]*ed[.] Bomb[.].*?</ls>'), '1e'),
        ]),
        (compiled(r'<ls n="%s[^"]*">.*?</ls>'), [
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.]?)</ls>'), '2a'),
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.]?) fgg?[.]</ls>'), '2b'),
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.,]?) v[.] l[.]</ls>'), '2c'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.]?)</ls>'), '2d'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.]?) fgg?[.]</ls>'), '2e'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.,]?) v[.] l[.]</ls>'), '2f'),
        ]),
    ]
    abnormals = []
    for entry in entries:
        for iline, line in enumerate(entry.datalines):
            ls = _first_abnormal_ls(line, regexdata)
            if ls is not None:
                # generate change transaction for this line
                abnormals.append(LSchange(entry, iline, ls))
    print(len(abnormals), 'abnormal lines found')
    return abnormals
|
||
def normals_summary(normals):
    """Print how many ls instances there are of each type, then the total.

    normals -- iterable of objects carrying a ``type`` attribute.
    Types are reported in sorted order, one line per type.
    """
    counts = {}
    for inst in normals:
        counts[inst.type] = counts.get(inst.type, 0) + 1
    for lstype in sorted(counts):
        print(counts[lstype], "ls instances of type", lstype)
    print('totals=', sum(counts.values()))
|
||
def write_abnormals(fileout, abnormals):
    """Write one change transaction per abnormal line to fileout (utf-8).

    Each transaction is six lines: a separator comment, the entry's meta line
    with everything from '<k2>' on removed, the abnormal ls, the 'old' line,
    a ';' line and an identical 'new' line.  The count is printed at the end.
    """
    with codecs.open(fileout, "w", "utf-8") as f:
        for change in abnormals:
            rec = change.entry
            idx = change.iline
            # linenum1 is the meta line; data line idx lies idx+1 lines below
            lnum = rec.linenum1 + idx + 1
            header = re.sub(r'<k2>.*$', '', rec.metaline)
            text = rec.datalines[idx]
            block = [
                '; --------------------------------',
                '; %s' % header,
                '; Abnormal ls: %s' % change.ls,
                '%s old %s' % (lnum, text),
                ';',
                '%s new %s' % (lnum, text),
            ]
            f.write('\n'.join(block) + '\n')
    print(len(abnormals), 'change transactions', fileout)
|
||
if __name__=="__main__":
    # Command line: <lspfx> <filein> <fileout>
    if len(sys.argv) < 4:
        # report missing arguments instead of failing with an IndexError
        print('Usage: python %s <lspfx> <filein> <fileout>' % sys.argv[0])
        sys.exit(1)
    lspfx = sys.argv[1]    # literary-source prefix; only 'MBH.' is handled
    filein = sys.argv[2]   # xxx.txt (path to digitization of xxx)
    fileout = sys.argv[3]  # file receiving the change transactions
    entries = init_entries(filein)
    if lspfx == 'MBH.':
        abnormals = find_abnormals_mbh(lspfx, entries)
        write_abnormals(fileout, abnormals)
    else:
        print('Not implemented for lspfx = %s' % lspfx)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
#-*- coding:utf-8 -*- | ||
"""listls_abnormal.py | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
from parseheadline import parseheadline | ||
|
||
class Entry(object):
    """One entry of the digitization: meta line, data lines and <LEND> line.

    The class-level Ldict is shared by all instances and maps each 'L' value
    (taken from the parsed meta line) to its Entry, to detect duplicates.
    """
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        """Build an entry from its raw lines.

        lines    -- all lines of the entry; lines[0] is the meta line and
                    lines[-1] is the <LEND> line.
        linenum1 -- int, line number of the meta line in the input file.
        linenum2 -- int, line number of the <LEND> line in the input file.

        Prints a message and exits with status 1 if the entry's L value
        has already been registered in Ldict.
        """
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary-like object indexed by field
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit instead of the bare exit(): the latter is injected by
            # the site module for interactive sessions and may be absent.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []  # starts empty; not filled in by this class
|
||
def init_entries(filein):
    """Read filein (utf-8) and return its entries as a list of Entry objects.

    An entry opens at a line starting with '<L>' and closes at the next line
    starting with '<LEND>'.  Lines outside of entries are skipped.  The number
    of lines read and of entries found is printed before returning.

    Exits with status 1 (after printing a diagnostic) when
      1. an '<L>' line appears inside an open entry,
      2. an '<LEND>' line appears outside of any entry,
      3. the file ends while an entry is still open.
    """
    # slurp lines
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    start = None  # index of the '<L>' line of the open entry, None if outside
    for idx, line in enumerate(lines):
        if start is None:
            # outside of an entry: looking for '<L>'
            if line.startswith('<L>'):
                start = idx
            elif line.startswith('<LEND>'):  # error
                print('init_entries Error 2. Not expecting <LEND>')
                print("line # ", idx + 1)
                print(line.encode('utf-8'))
                sys.exit(1)
        elif line.startswith('<LEND>'):
            # Entry receives 1-based line numbers
            recs.append(Entry(lines[start:idx + 1], start + 1, idx + 1))
            start = None
        elif line.startswith('<L>'):  # error
            print('init_entries Error 1. Not expecting <L>')
            print("line # ", idx + 1)
            print(line.encode('utf-8'))
            sys.exit(1)
    # when all lines are read, no entry should still be open
    if start is not None:
        print('init_entries Error 3. Last entry not closed')
        print('Open entry starts at line', start + 1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs
|
||
|
||
class LSinstance(object):
    """An <ls> element found in an entry, with an optional type label."""

    def __init__(self, entry, ls, lstype=None):
        # entry:  the entry in which the element occurs
        # ls:     text of the <ls>...</ls> element
        # lstype: type label for the element; None when none is given
        self.ls = ls
        self.entry = entry
        self.type = lstype
|
||
def find_abnormals_mbh(lspfx, entries):
    """Classify every <ls> reference with prefix lspfx as abnormal or normal.

    lspfx   -- literal text of the literary-source prefix (e.g. 'MBH.').
    entries -- iterable of objects with a ``datalines`` list of strings.

    The data lines of each entry are joined with newlines and searched for
    two markup families: '<ls>PFX ...</ls>' and '<ls n="PFX...">...</ls>'
    (an element may span lines).  An element matching one of the accepted
    forms of its family becomes a normal LSinstance typed with that form's
    label ('1a'..'1e', '2a'..'2f'); any other element becomes an abnormal
    LSinstance with type None.

    Returns the tuple (abnormals, normals), two lists of LSinstance.
    """
    # re.escape covers every regex metacharacter of the prefix
    pfx = re.escape(lspfx)

    def compiled(pattern, flags=0):
        return re.compile(pattern % pfx, flags)

    # each item: (pattern finding candidate elements, [(normal form, label)])
    # only the finders use DOTALL; the normal forms are applied without it
    regexdata = [
        (compiled(r'<ls>%s.*?</ls>', re.DOTALL), [
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.]?)</ls>'), '1a'),
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.]?) fgg?[.]</ls>'), '1b'),
            (compiled(r'<ls>%s ([0-9]+), ([0-9]+[.,]?) v[.] l[.]</ls>'), '1c'),
            (compiled(r'<ls>%s</ls>'), '1d'),
            (compiled(r'<ls>%s[^<]*ed[.] Bomb[.].*?</ls>'), '1e'),
        ]),
        (compiled(r'<ls n="%s[^"]*">.*?</ls>', re.DOTALL), [
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.]?)</ls>'), '2a'),
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.]?) fgg?[.]</ls>'), '2b'),
            (compiled(r'<ls n="%s">([0-9]+), ([0-9]+[.,]?) v[.] l[.]</ls>'), '2c'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.]?)</ls>'), '2d'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.]?) fgg?[.]</ls>'), '2e'),
            (compiled(r'<ls n="%s ([0-9]+),">([0-9]+[.,]?) v[.] l[.]</ls>'), '2f'),
        ]),
    ]
    abnormals = []
    normals = []
    for entry in entries:
        text = '\n'.join(entry.datalines)
        for finder, norms in regexdata:
            for ls in finder.findall(text):
                # label of the first normal form that matches, if any
                lstype = next((t for rx, t in norms if rx.search(ls)), None)
                if lstype is None:
                    abnormals.append(LSinstance(entry, ls))
                else:
                    normals.append(LSinstance(entry, ls, lstype))
    return abnormals, normals
|
||
def normals_summary(normals):
    """Print the number of ls instances per type, followed by the total.

    normals -- iterable of objects carrying a ``type`` attribute.
    One line is printed per type, in sorted order of the types.
    """
    tally = {}
    for item in normals:
        key = item.type
        tally[key] = tally.get(key, 0) + 1
    total = 0
    for key in sorted(tally):
        count = tally[key]
        print(count, "ls instances of type", key)
        total += count
    print('totals=', total)
|
||
def write_abnormals(fileout, abnormals):
    """Write one line per abnormal ls to fileout (utf-8): ls, k1 and L.

    abnormals -- iterable of objects with ``ls`` and ``entry`` attributes,
                 where entry.metad provides the 'L' and 'k1' fields.
    The number of records is printed once the file is written.
    """
    with codecs.open(fileout, "w", "utf-8") as f:
        for item in abnormals:
            meta = item.entry.metad
            L, k1 = meta['L'], meta['k1']
            f.write('%s %s %s' % (item.ls, k1, L) + '\n')
    print(len(abnormals), 'abnormal ls written to', fileout)
#exit(1) | ||
if __name__=="__main__":
    # Command line: <lspfx> <filein> <fileout>
    if len(sys.argv) < 4:
        # report missing arguments instead of failing with an IndexError
        print('Usage: python %s <lspfx> <filein> <fileout>' % sys.argv[0])
        sys.exit(1)
    lspfx = sys.argv[1]    # literary-source prefix; only 'MBH.' is handled
    filein = sys.argv[2]   # xxx.txt (path to digitization of xxx)
    fileout = sys.argv[3]  # file receiving the abnormal ls listing
    entries = init_entries(filein)
    if lspfx == 'MBH.':
        abnormals, normals = find_abnormals_mbh(lspfx, entries)
        write_abnormals(fileout, abnormals)
        normals_summary(normals)
        # finish normally: a successful run must not report exit status 1
    else:
        print('Not implemented for lspfx = %s' % lspfx)
Oops, something went wrong.