Skip to content

Commit

Permalink
PW: Alternate headwords and other work.
Browse files Browse the repository at this point in the history
Ref: sanskrit-lexicon/PWK#106
Installed temp_pw_9c.txt
  • Loading branch information
funderburkjim committed Mar 25, 2024
1 parent 2bff3dc commit e2fa906
Show file tree
Hide file tree
Showing 10 changed files with 64,060 additions and 20,915 deletions.
103 changes: 103 additions & 0 deletions v02/pw/althws/digentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#-*- coding:utf-8 -*-
"""digentry.py
Module to read a digitization
and generate a list of Entry objects
Adapted for temp_pwkvn_22.txt
"""
from __future__ import print_function
import sys,re,codecs

class Entry(object):
    """One entry of a digitization: meta line, data lines and <LEND> line.

    Attributes set by the constructor:
      metaline  -- first line of the entry (the '<L>...' line)
      lend      -- last line of the entry (the '<LEND>' line)
      datalines -- the lines strictly between metaline and lend
      metad     -- dictionary returned by parseheadline(metaline)
      linenum1, linenum2 -- the int line numbers passed by the caller
      lsarr     -- an empty list (not filled in by this class)

    Every instance registers itself in the class-level dictionary Ldict
    under its 'L' value.  A duplicate 'L' prints a message and terminates
    the program with exit status 1.
    """
    # registry shared by ALL instances: L value (string) -> Entry
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        # linenum1,2 are int
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit, not the builtin exit(): the latter is installed by
            # the 'site' module and is not guaranteed to exist.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []

def init(filein):
    """Read the utf-8 file `filein` and return a list of Entry objects.

    An entry starts at a line beginning with '<L>' and ends at the next
    line beginning with '<LEND>'; lines outside of entries are ignored.
    Each Entry receives its lines (meta line through <LEND> line) and the
    1-based line numbers of its first and last line.

    The number of lines read and of entries found is printed.

    A malformed file terminates the program with exit status 1 after
    printing a diagnostic:
      Error 1 -- '<L>' found while an entry is still open
      Error 2 -- '<LEND>' found while no entry is open
      Error 3 -- end of file reached while an entry is still open
    """
    # slurp lines
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    # idx1 is the index of the '<L>' line of the currently open entry,
    # or None when we are between entries.
    idx1 = None
    for idx, line in enumerate(lines):
        if idx1 is not None:
            # inside an entry: looking for '<LEND>'
            if line.startswith('<LEND>'):
                entrylines = lines[idx1:idx + 1]
                recs.append(Entry(entrylines, idx1 + 1, idx + 1))
                # prepare for next entry
                idx1 = None
            elif line.startswith('<L>'):  # error
                print('init_entries Error 1. Not expecting <L>')
                print("line # ", idx + 1)
                print(line.encode('utf-8'))
                sys.exit(1)
            # any other line is entry data: keep looking for <LEND>
        elif line.startswith('<L>'):
            idx1 = idx
        elif line.startswith('<LEND>'):  # error
            print('init_entries Error 2. Not expecting <LEND>')
            print("line # ", idx + 1)
            print(line.encode('utf-8'))
            sys.exit(1)
        # any other line is outside of entries: keep looking for <L>
    # when all lines are read, no entry should still be open
    if idx1 is not None:
        print('digentry.init Error 3. for file', filein)
        print('Last entry not closed. Open entry starts at line', idx1 + 1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs

def parseheadline(headline):
    """
    function to parse a 'metaline' and return a dictionary.
    Each '<tag>value' pair of the line becomes one item tag -> value;
    a value extends up to the next '<' (or the end of the line) and may
    be empty.  Text before the first '<' is ignored.  If a tag occurs
    more than once, the last value is kept.
    Example:
    headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
    returns dictionary
    {'L': '16850',
    'pc': '292-3',
    'k1': 'visarga',
    'k2': 'visarga',
    'h': '1',
    'e': ''}
    """
    headline = headline.strip()
    # With two groups, findall yields (tag, value) tuples in order,
    # which is exactly the item list dict() needs.
    return dict(re.findall(r'<([^>]*)>([^<]*)', headline))

if __name__ == "__main__":
    # Script use: read the digitization named on the command line; init
    # prints the line and entry counts.
    if len(sys.argv) < 2:
        # explicit message instead of a bare IndexError traceback
        print('Usage: python digentry.py filein')
        sys.exit(1)
    filein = sys.argv[1]  # xxx.txt (path to digitization of xxx)
    entries = init(filein)
32 changes: 32 additions & 0 deletions v02/pw/althws/make_hwextra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#-*- coding:utf-8 -*-
""" make_hwextra.py
"""
from __future__ import print_function
import sys,re,codecs

def read_lines(filein):
    """Return the lines of the utf-8 file `filein` as a list of strings.

    Trailing carriage-return / newline characters are removed from
    every line.
    """
    collected = []
    with codecs.open(filein, mode='r', encoding='utf-8') as handle:
        for raw in handle:
            collected.append(raw.rstrip('\r\n'))
    return collected

def extract(lines):
    """Return, in order, the lines that are not comments.

    A comment line is one whose first character is ';'.
    """
    kept = []
    for text in lines:
        if text.startswith(';'):
            # comment line: drop it
            continue
        kept.append(text)
    return kept

def write(fileout, lines):
    """Write each item of `lines`, followed by a newline, to the utf-8
    file `fileout`, then print how many records were written.
    """
    payload = ''.join(record + '\n' for record in lines)
    with codecs.open(fileout, "w", "utf-8") as sink:
        sink.write(payload)
    print(len(lines), "records written to", fileout)

if __name__ == "__main__":
    # Script use: copy the non-comment lines of filein to fileout.
    if len(sys.argv) < 3:
        # explicit message instead of a bare IndexError traceback
        print('Usage: python make_hwextra.py filein fileout')
        sys.exit(1)
    filein = sys.argv[1]  # multik2
    fileout = sys.argv[2]  # multik2a
    lines = read_lines(filein)
    hwextras = extract(lines)
    write(fileout, hwextras)



91 changes: 91 additions & 0 deletions v02/pw/althws/multik2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#-*- coding:utf-8 -*-
"""multik2.py
"""
from __future__ import print_function
import sys,re,codecs
import digentry

def generate_Ls(L0, L1, n):
    """Return a list of n new L-values (strings) that follow L0.

    A difficult problem in general.
    Assume L0 and L1 are digit sequences (or, L1 may be None, in which
    case int(L0)+1 is used in its place).  L1 serves only for the sanity
    check int(L0) < int(L1).

    For n < 10 the values are L0.1, L0.2, ...  (one decimal place);
    for 10 <= n <= 99 they are L0.01, L0.02, ...  (two decimal places).
    Example: generate_Ls('16850', '16851', 2) -> ['16850.1', '16850.2']

    Raises AssertionError when int(L0) >= int(L1) or when n is not in
    the range 1..99.  The checks are explicit raises so that they are
    still performed when Python runs with -O.
    """
    n0 = int(L0)
    n1 = n0 + 1 if L1 is None else int(L1)
    if not (n0 < n1):
        raise AssertionError(
            "generate_Ls ERROR 1: L0=%s, L1=%s" % (L0, L1))
    if not (1 <= n < 100):
        raise AssertionError(
            "generate_Ls ERROR 2: n=%s, L0=%s, L1=%s" % (n, L0, L1))
    # construct intermediate values
    if n < 10:
        fmt, scale = '%0.1f', 10.0
    else:  # 10<=n<=99
        fmt, scale = '%0.2f', 100.0
    # Compute each value directly from k rather than by repeatedly
    # adding a float step, so rounding error cannot accumulate.
    return [fmt % (n0 + k / scale) for k in range(1, n + 1)]


def multik2s(entries):
    """ construct tab-delimited records needed for construction of hwextra

    `entries` is a sequence of objects having attributes `metaline`
    (string) and `metad` (dictionary with keys 'L', 'pc', 'k1', 'k2'),
    such as the Entry objects returned by digentry.init.

    One record is made for each entry whose k2 contains a comma.  Its
    five fields are:
      L, L of the next entry, pc, k1, k2
    When the entry is the last one there is no next entry and the second
    field is the empty string.
    NOTE(review): the empty field replaces a None that made '\\t'.join
    raise TypeError; confirm that the reader of these records accepts it.

    A WARNING is printed for a selected entry whose metaline is not
    exactly '<L>..<pc>..<k1>..<k2>..'.  The number of records is printed.
    Returns the list of records.
    """
    althws = []
    nentries = len(entries)
    for ientry, entry in enumerate(entries):
        metaline = entry.metaline
        L0 = entry.metad['L']
        pc0 = entry.metad['pc']
        k10 = entry.metad['k1']
        k20 = entry.metad['k2']
        if ',' not in k20:
            # no alternates for this entry
            continue
        # only L0,pc0,k10,k20 should be in metaline
        metacalc = '<L>%s<pc>%s<k1>%s<k2>%s' % (L0, pc0, k10, k20)
        if metaline != metacalc:
            print('WARNING: metaline=%s' % metaline)
            print(' metacalc=%s' % metacalc)
        # Get L for next entry, or '' if this is last entry
        ientry1 = ientry + 1
        if ientry1 < nentries:
            L1 = entries[ientry1].metad['L']
        else:
            L1 = ''
        althws.append('\t'.join((L0, L1, pc0, k10, k20)))
    print(len(althws), "metalines with comma")
    return althws

def write(fileout, lines):
    """Write each item of `lines`, followed by a newline, to the utf-8
    file `fileout`, then print how many records were written.
    """
    with codecs.open(fileout, "w", "utf-8") as f:
        for line in lines:
            f.write(line + '\n')
    print(len(lines), "records written to", fileout)

if __name__ == "__main__":
    # Script use: read the digitization filein and write to fileout one
    # record for each entry whose k2 contains a comma.
    if len(sys.argv) < 3:
        # explicit message instead of a bare IndexError traceback
        print('Usage: python multik2.py filein fileout')
        sys.exit(1)
    filein = sys.argv[1]  # pwkvn.txt
    fileout = sys.argv[2]  # pwhvn_hwextra.txt

    entries = digentry.init(filein)
    althws = multik2s(entries)

    write(fileout, althws)



Loading

0 comments on commit e2fa906

Please sign in to comment.