Skip to content

Commit

Permalink
PWK ls summary;
Browse files Browse the repository at this point in the history
Ref: #85
  • Loading branch information
funderburkjim committed Feb 19, 2022
1 parent 2dbd3a9 commit e78bfdb
Show file tree
Hide file tree
Showing 4 changed files with 1,191 additions and 0 deletions.
1 change: 1 addition & 0 deletions pw_ls/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ occuring in the PW dictionary.
* **spruch** Improve markup for references to 'Indische Spruch' so displays
can link to an html version of the verses.

* **summary** Generate a summary of counts of ls references for pwk
324 changes: 324 additions & 0 deletions pw_ls/summary/lsextract_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
#-*- coding:utf-8 -*-
"""lsextract_all.py -- summary stats
"""
import sys,re,codecs
## https:##stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
## This required by git bash to avoid error
## UnicodeEncodeError: 'charmap' codec cannot encode characters
## when run in a git bash script.

sys.stdout.reconfigure(encoding='utf-8')
class Change(object):
def __init__(self,metaline,page,iline,old,new,reason,iline1,line1,new1):
self.metaline = metaline
self.page = page
self.iline = iline
self.old = old
self.new = new
self.reason = reason
self.iline1 = iline1
self.line1 = line1
self.new1 = new1

def init_changes(lines,tipd):
changes = [] # array of Change objects
metaline = None
imetaline1 = None
page = None
for iline,line in enumerate(lines):
if iline == 0: # %***This File is E:\\APTE.ALL, Last update 11.09.06
continue #
line = line.rstrip('\r\n')
if line == '':
continue
if line.startswith('<L>'):
metaline = line
imetaline1 = iline+1
continue
if line == '<LEND>':
metaline = None
imetaline = None
continue
if line.startswith('[Page'):
page = line
continue
reason=''
newline = ls_questionmark(line,tipd)
if newline == line:
continue
# generate a change
# look at previous line(s) for last '<ls>X</ls>' and derive source
found = None
if found == None:
iline1 = None
line1 = None
newline1 = None
reason = ''
#print('manual check:',iline+1,line)
else:
newline1 = line1 # re.sub(r'#} *$',' …#}',line1)
change = Change(metaline,page,iline,line,newline,reason,iline1,line1,newline1)
changes.append(change)
print(len(changes),'potential changes found')
return changes

def change_out(change,ichange):
outarr = []
case = ichange + 1
#outarr.append('; TODO Case %s: (reason = %s)' % (case,change.reason))
try:
ident = change.metaline
except:
print('ERROR:',change.iline,change.old)
exit(1)
if ident == None:
ident = change.page
outarr.append('; ' + ident)
# possible change for iline1
if change.iline1 != None:
lnum = change.iline1 + 1
line = change.line1
new = change.new1
outarr.append('%s old %s' % (lnum,line))
outarr.append('%s new %s' % (lnum,new))
outarr.append(';')

# change for iline
lnum = change.iline + 1
line = change.old
new = change.new
outarr.append('%s old %s' % (lnum,line))
outarr.append('%s new %s' % (lnum,new))
outarr.append(';')

# dummy next line
return outarr

def write_changes(fileout,changes):
with codecs.open(fileout,"w","utf-8") as f:
for ichange,change in enumerate(changes):
outarr = change_out(change,ichange)
for out in outarr:
f.write(out+'\n')
print(len(changes),"possible changes written to",fileout)

class Tooltip(object):
def __init__(self,line):
line = line.rstrip('\r\n')
# pwg has code, abbrevUpper, abbrevLower,tip
self.code,self.abbrev,self.abbrevlo,self.tip = line.split('\t')
self.total = 0

def init_tooltip(filein):
with codecs.open(filein,"r","utf-8") as f:
ans = [Tooltip(x) for x in f]
print(len(ans),'tooltips from',filein)
return ans

def dfirstchar(tooltips_sorted):
d = {}
for tip in tooltips_sorted:
c = tip.abbrev[0]
if c not in d:
d[c] = []
d[c].append(tip)
return d

def findtip(ls,tiplist):
for tip in tiplist:
if ls.startswith(tip.abbrev):
return tip
return None

class LSCase(object):
def __init__(self,ls,abbrev,metaline,iline,line):
self.ls = ls
self.abbrev = abbrev
self.metaline = metaline
self.iline = iline
self.line = line
self.parmstr = ls[len(abbrev):].strip()
if self.parmstr == '':
self.nparms = 0
else:
self.nparms = len(self.parmstr.split(' '))
self.len = len(self.parmstr)
#if ls == abbrev:
# print(ls,"'%s'" %self.parmstr,self.nparms)
# exit(1)

def count_tips(lines,tipd,numbertip,unknowntip):
#
lsentries = [] # list of 'entry' with ls of given abbrev
metaline = None
imetaline1 = None
page = None
for iline,line in enumerate(lines):
if iline == 0: # %***This File is E:\\APTE.ALL, Last update 11.09.06
continue #
line = line.rstrip('\r\n')
if line == '':
continue
if line.startswith('<L>'):
metaline = line
imetaline1 = iline+1
entry = [] # list of LSCase appearing in this entry
continue
if line == '<LEND>':
if len(entry)>0:
lsentries.append(entry)
#
metaline = None
imetaline = None
continue
if line.startswith('[Page'):
page = line
continue
for m in re.finditer(r'<ls([^>]*)>([^<]*)</ls>',line):
attrib = m.group(1)
elt = m.group(2)
if len(elt) == 0:
print('WARNING at line %s %s' % (iline+1,metaline))
print('ls = ',m.group(0))
tip = unknowntip
tip.total = tip.total + 1
continue
m1 = re.search(r' +n="(.*?)"',attrib)
if m1 != None:
nval = m1.group(1)
elt = nval + ' ' + elt
if re.search(r'^[0-9]',elt): # number
tip = numbertip
elif elt[0] not in tipd:
tip = unknowntip
else:
tiplist = tipd[elt[0]]
tip = findtip(elt,tiplist)
if tip == None:
tip = unknowntip
# found a match

tip.total = tip.total + 1
#lscase = LSCase(elt,abbrev,metaline,iline,line)
#entry.append(lscase)

#print(len(lsentries),'entries with ls for %s'%abbrev)
#return lsentries

def unused_write_lscases(fileout,cases,abbrev):
parmsd = {} #
mparm = 0
nparms = []
for ls in cases:
n = ls.nparms
if n not in parmsd:
parmsd[n] = []
parmsd[n].append(ls)
if n > mparm:
mparm = n
if n not in nparms:
nparms.append(n)

nparms.sort()
print(nparms)
for n in nparms:
casesn = parmsd[n]
print("%s cases with %s parms" %(len(casesn),n))
print('ok so far')
f = codecs.open(fileout,"w","utf-8")
n0 = 4
for n in nparms:
casesn = parmsd[n]
if n != n0:
continue
f.write(';-----------------------------------------------------------\n')
f.write('; %s %s instances with %s parameters\n' %(len(casesn),abbrev,n))
f.write(';-----------------------------------------------------------\n')

for lscase in casesn:
outarr = []
if n == n0:
outarr.append('; %s' %lscase.metaline)
outarr.append('; %s' %lscase.ls)
newls = re.sub(r'(%s [0-9]+, [0-9]+, [0-9]+[.]) ' % abbrev,
r'\1</ls> <ls n="%s">' % abbrev,lscase.ls)
if newls != lscase.ls:
outarr.append('%s old %s' %(lscase.iline+1,lscase.line))
newline = lscase.line.replace(lscase.ls,newls)
outarr.append('%s new %s' %(lscase.iline+1,newline))
else:
outarr.append('; %s old %s' %(lscase.iline+1,lscase.line))
outarr.append(';')
else:
outarr.append('; %s' %lscase.ls)
for out in outarr:
f.write(out+'\n')
f.close()

def write_tips(tips0,numbertip,unknowntip):
outrecs = []
outrecs.append('') # for totals
tips = sorted(tips0,key = lambda tip: tip.total,reverse=True)
def tipformat(tip):
text = tip.tip
text = re.sub(r'^.*? = ','',text)
text = text.replace('[Cologne Addition]','')
text = text[0:40]
return '%05d\t%s\t%s' %(tip.total,tip.abbrev,text)
outrecs.append(tipformat(numbertip))
outrecs.append(tipformat(unknowntip))
tot = 0
tot = tot + numbertip.total
tot = tot + unknowntip.total
for tip in tips:
outrecs.append(tipformat(tip))
tot = tot + tip.total
#
import datetime
x = datetime.datetime.now()
date = x.strftime("%Y-%m-%d")
outrecs[0] = '%05d\t%s\tAs of %s' %(tot,'ALL',date)
with codecs.open(fileout,"w","utf-8") as f:
for out in outrecs:
f.write(out+'\n')
print("write_tips Output in ",fileout)

def write_lsentries(fileout,lsentries,abbrev):
f = codecs.open(fileout,"w","utf-8")
n0 = 0
ntot = 0
for lscases in lsentries:
# lscases is a non-empty list of LSCase objects
metaline = lscases[0].metaline
n = len(lscases)
ntot = ntot + n
f.write(';-----------------------------------------------------------\n')
x = re.sub(r'<k2>.*$','',metaline)
f.write('; %s {%s %s}\n' %(x,abbrev,n))
#f.write(';-----------------------------------------------------------\n')

for lscase in lscases:
f.write(lscase.ls + '\n')
#f.write(';-----------------------------------------------------------\n')
f.close()
print(ntot,'= number of %s ls references'%abbrev)
if __name__=="__main__":

filein = sys.argv[1] # xxx.txt (path to digitization of xxx)
filetip = sys.argv[2] # pwgbib_input.txt
fileout = sys.argv[3] # output summary
tips0 = init_tooltip(filetip)
tips = sorted(tips0,key = lambda tip: len(tip.abbrev),reverse=True)
tipd = dfirstchar(tips)
# dummy for number
numbertip = Tooltip("9.1\tNUMBER\tnumber\tls starts with number")
# dummy for unknown
unknowntip = Tooltip("9.2\tUNKNOWN\tunknown\tls is unknown")

with codecs.open(filein,"r","utf-8") as f:
lines = [x.rstrip('\r\n') for x in f]
count_tips(lines,tipd,numbertip,unknowntip) # also, updates tip.changes
write_tips(tips0,numbertip,unknowntip)
exit(1)
write_lsentries(fileout,lsentries,abbrev)

Loading

0 comments on commit e78bfdb

Please sign in to comment.