-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PW: Alternate headwords and other work.
Ref: sanskrit-lexicon/PWK#106 Installed temp_pw_9c.txt
- Loading branch information
1 parent
2bff3dc
commit e2fa906
Showing
10 changed files
with
64,060 additions
and
20,915 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
#-*- coding:utf-8 -*- | ||
"""digentry.py | ||
Module to read a digitization | ||
and generate a list of Entry objects | ||
Adapted for temp_pwkvn_22.txt | ||
""" | ||
from __future__ import print_function | ||
import sys,re,codecs | ||
|
||
class Entry(object):
    """One entry of a digitization: a '<L>' metaline through its '<LEND>' line.

    Class attribute:
      Ldict -- dictionary shared by all instances, mapping the 'L' value
               of each constructed entry to that Entry object.  It is used
               to detect duplicate 'L' values.

    Instance attributes set by the constructor:
      metaline  -- first element of `lines`
      lend      -- last element of `lines` (the <LEND> line)
      datalines -- the elements of `lines` between metaline and lend
      metad     -- dictionary obtained by parsing metaline with parseheadline
      linenum1, linenum2 -- line numbers (int) supplied by the caller
      lsarr     -- empty list; nothing in this module fills it
    """
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        """Build the entry from its lines.

        lines: list of strings; lines[0] is the metaline and lines[-1]
               is the <LEND> line.
        linenum1, linenum2: int line numbers of the first and last line.

        Raises KeyError if the metaline has no 'L' field.
        Prints a message and exits with status 1 (SystemExit) if another
        entry with the same 'L' has already been constructed.
        """
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit rather than the builtin exit(): the latter is added
            # by the site module and is absent when site is not loaded.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []
|
||
def init(filein):
    """Read a digitization file and return its entries.

    filein: path of a utf-8 text file.  An entry is the run of lines from
    a line starting with '<L>' through the next line starting with
    '<LEND>', inclusive.  Lines outside of entries are ignored.

    Returns a list of Entry objects in file order; each Entry receives the
    1-based line numbers of its first and last line.  Two summary lines
    (number of lines read, number of entries found) are printed.

    A malformed file prints a diagnostic and exits with status 1
    (SystemExit):
      Error 1 -- a '<L>' line inside an entry that is still open
      Error 2 -- a '<LEND>' line when no entry is open
      Error 3 -- the file ends while an entry is still open
    """
    # slurp lines
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    # Index of the '<L>' line of the entry being read; None when we are
    # between entries.  This single variable carries the parser state.
    idx1 = None
    for idx, line in enumerate(lines):
        if idx1 is None:
            # Looking for '<L>'
            if line.startswith('<L>'):
                idx1 = idx
            elif line.startswith('<LEND>'):  # error
                print('init_entries Error 2. Not expecting <LEND>')
                print("line # ", idx + 1)
                # NOTE(review): under Python 3 this prints a bytes repr;
                # left unchanged so Python 2 console output is unaffected.
                print(line.encode('utf-8'))
                # sys.exit, not the site-provided builtin exit()
                sys.exit(1)
        elif line.startswith('<LEND>'):
            # Entry complete: lines idx1..idx inclusive
            recs.append(Entry(lines[idx1:idx + 1], idx1 + 1, idx + 1))
            # prepare for next entry
            idx1 = None
        elif line.startswith('<L>'):  # error
            print('init_entries Error 1. Not expecting <L>')
            print("line # ", idx + 1)
            print(line.encode('utf-8'))
            sys.exit(1)
    # when all lines are read, no entry should still be open
    if idx1 is not None:
        print('digentry.init Error 3. for file', filein)
        print('Last entry not closed. Open entry starts at line', idx1 + 1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs
|
||
def parseheadline(headline):
    """Turn a metaline into a dictionary of its tagged fields.

    Each '<tag>' in the (whitespace-stripped) headline becomes a key whose
    value is the text that follows it, up to the next '<'.  When a tag is
    repeated, the last occurrence wins.

    Example:
      parseheadline('<L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>')
    returns
      {'L': '16850', 'pc': '292-3', 'k1': 'visarga', 'k2': 'visarga',
       'h': '1', 'e': ''}
    """
    fields = {}
    # Walk the tag/value pairs one match at a time.
    for match in re.finditer('[<]([^>]*)[>]([^<]*)', headline.strip()):
        tag, value = match.groups()
        fields[tag] = value
    return fields
|
||
if __name__ == "__main__":
    # Usage: python digentry.py xxx.txt
    # where xxx.txt is the path to the digitization of dictionary xxx.
    filein = sys.argv[1]
    entries = init(filein)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#-*- coding:utf-8 -*- | ||
""" make_hwextra.py | ||
""" | ||
from __future__ import print_function | ||
import sys,re,codecs | ||
|
||
def read_lines(filein):
    """Return the lines of utf-8 file `filein` as a list of strings.

    Trailing carriage-return and newline characters are removed from each
    line; all other whitespace is kept.
    """
    stripped = []
    with codecs.open(filein, encoding='utf-8', mode='r') as handle:
        for raw in handle:
            stripped.append(raw.rstrip('\r\n'))
    return stripped
|
||
def extract(lines):
    """Return the non-comment lines, in order.

    A comment line is one whose first character is ';'.
    """
    kept = []
    for text in lines:
        if text.startswith(';'):
            continue  # comment line: drop it
        kept.append(text)
    return kept
|
||
def write(fileout, lines):
    """Write `lines` to utf-8 file `fileout`, one per line.

    A newline is appended to every element of `lines`.  After the file is
    closed, the number of records written is printed.
    """
    terminated = [out + '\n' for out in lines]
    with codecs.open(fileout, "w", "utf-8") as f:
        f.writelines(terminated)
    print(len(lines), "records written to", fileout)
|
||
if __name__ == "__main__":
    # Usage: python make_hwextra.py <multik2> <multik2a>
    # Copies the input to the output, leaving out the ';' comment lines.
    filein = sys.argv[1]  # multik2
    fileout = sys.argv[2]  # multik2a
    lines = read_lines(filein)
    hwextras = extract(lines)
    write(fileout, hwextras)
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#-*- coding:utf-8 -*- | ||
"""multik2.py | ||
""" | ||
from __future__ import print_function | ||
import sys,re,codecs | ||
import digentry | ||
|
||
def generate_Ls(L0, L1, n):
    """Generate `n` new L-values that follow L0 and precede L1.

    A difficult problem in general.  Here it is assumed that L0 and L1
    are digit sequences (L1 may also be None, in which case int(L0)+1
    is used in its place).

    L0: string of digits.
    L1: string of digits whose integer value is greater than that of L0,
        or None.
    n:  number of values wanted, 1 <= n <= 99.

    Returns a list of n strings.  With N = int(L0):
      n < 10   -> 'N.1', 'N.2', ...     (one decimal place)
      n >= 10  -> 'N.01', 'N.02', ...   (two decimal places)

    Raises AssertionError when int(L0) >= int(L1) or n is out of range,
    and ValueError when L0 or L1 is not an integer string.
    """
    n0 = int(L0)
    n1 = n0 + 1 if L1 is None else int(L1)
    assert n0 < n1, "generate_Ls ERROR 1: L0=%s, L1=%s" % (L0, L1)
    assert 1 <= n < 100, "generate_Ls ERROR 2: n=%s" % n
    if n < 10:
        divisor, fmt = 10.0, '%0.1f'
    else:  # 10 <= n <= 99
        divisor, fmt = 100.0, '%0.2f'
    # Each value is computed directly from k rather than by repeatedly
    # adding a float increment, so rounding error does not accumulate.
    return [fmt % (n0 + k / divisor) for k in range(1, n + 1)]
|
||
|
||
def multik2s(entries):
    """Construct the tab-delimited records needed for construction of hwextra.

    entries: sequence of objects having a `metaline` string and a `metad`
    dictionary with keys 'L', 'pc', 'k1' and 'k2' (as built by
    digentry.init).

    One record is produced for each entry whose k2 contains a comma, i.e.
    an entry with alternate headwords.  The record has five fields:
        L, L-of-next-entry, pc, k1, k2
    When the entry is the last one there is no next entry and the second
    field is left empty.
    NOTE(review): the empty field is an assumption; confirm that whatever
    reads these records accepts it.

    A warning is printed when the metaline contains anything besides the
    L, pc, k1 and k2 fields.  The number of records is printed at the end.

    Returns the list of record strings.
    """
    althws = []
    nentries = len(entries)
    for ientry, entry in enumerate(entries):
        metaline = entry.metaline
        metad = entry.metad
        L0 = metad['L']
        pc0 = metad['pc']
        k10 = metad['k1']
        k20 = metad['k2']
        if ',' not in k20:
            # no alternates for this entry
            continue
        # only L0,pc0,k10,k20 should be in metaline
        metacalc = '<L>%s<pc>%s<k1>%s<k2>%s' % (L0, pc0, k10, k20)
        if metaline != metacalc:
            print('WARNING: metaline=%s' % metaline)
            print('         metacalc=%s' % metacalc)
        # Get L for next entry, or an empty field if this is the last
        # entry.  (None cannot be used: str.join accepts only strings.)
        ientry1 = ientry + 1
        if ientry1 < nentries:
            L1 = entries[ientry1].metad['L']
        else:
            L1 = ''
        althws.append('\t'.join((L0, L1, pc0, k10, k20)))
    print(len(althws), "metalines with comma")
    return althws
|
||
def write(fileout, lines):
    """Write each element of `lines`, followed by a newline, to `fileout`.

    The file is written as utf-8.  Once it is closed, the number of
    records written is printed.
    """
    with codecs.open(fileout, "w", "utf-8") as f:
        f.writelines([line + '\n' for line in lines])
    print(len(lines), "records written to", fileout)
|
||
if __name__ == "__main__":
    # Usage: python multik2.py <pwkvn.txt> <pwhvn_hwextra.txt>
    filein = sys.argv[1]  # pwkvn.txt
    fileout = sys.argv[2]  # pwhvn_hwextra.txt

    # Parse the digitization, then keep the entries with alternate headwords.
    entries = digentry.init(filein)
    althws = multik2s(entries)

    write(fileout, althws)
|
||
|
||
|
Oops, something went wrong.