PW: Alternate headwords and other work.

Ref: sanskrit-lexicon/PWK#106 Installed temp_pw_9c.txt
sanskrit-lexicon · Mar 25, 2024 · e2fa906 · e2fa906
1 parent 2bff3dc
commit e2fa906
Show file tree

Hide file tree

Showing 10 changed files with 64,060 additions and 20,915 deletions.
diff --git a/v02/pw/althws/digentry.py b/v02/pw/althws/digentry.py
@@ -0,0 +1,103 @@
+#-*- coding:utf-8 -*-
+"""digentry.py
+  Module to read a digitization 
+  and generate a list of Entry objects
+  Adapted for temp_pwkvn_22.txt
+"""
+from __future__ import print_function
+import sys,re,codecs
+
+class Entry(object):
+ """One entry of a digitization: meta line, data lines and <LEND> line.
+
+  Attributes set by the constructor:
+   metaline  : first line of the entry
+   lend      : last line of the entry (the <LEND> line)
+   datalines : the lines between metaline and lend
+   metad     : dictionary made from metaline by parseheadline
+   linenum1, linenum2 : int line numbers supplied by the caller
+   lsarr     : empty list (not filled in by the constructor)
+ """
+ # Registry shared by ALL Entry objects (class attribute): L -> Entry.
+ # It is never cleared here, so an L may be used only once per process.
+ Ldict = {}
+ def __init__(self,lines,linenum1,linenum2):
+  """lines: list of strings, from the meta line through the <LEND> line.
+   linenum1,2 are int.
+   Prints a message and exits with status 1 when the L of the meta line
+   is already registered in Ldict.
+   Raises KeyError when the meta line has no L field.
+  """
+  self.metaline = lines[0]
+  self.lend = lines[-1]  # the <LEND> line
+  self.datalines = lines[1:-1]  # the non-meta lines
+  # parse the meta line into a dictionary
+  self.metad = parseheadline(self.metaline)
+  self.linenum1 = linenum1
+  self.linenum2 = linenum2
+  L = self.metad['L']
+  if L in self.Ldict:
+   print("Entry init error: duplicate L",L,linenum1)
+   # sys.exit rather than exit(): the bare exit() name is installed by
+   # the site module and is not guaranteed to exist
+   sys.exit(1)
+  self.Ldict[L] = self
+  self.lsarr = []
+
+def _init_error(message,idx,line):
+ """Print message, the 1-based number of line idx and the line; exit(1)."""
+ print(message)
+ print("line # ",idx+1)
+ # NOTE: under Python 3 this prints the bytes repr (b'...') of the line
+ print(line.encode('utf-8'))
+ sys.exit(1)
+
+def init(filein):
+ """Read the utf-8 file filein and return a list of Entry objects.
+
+  An entry starts at a line beginning with '<L>' and ends at the next
+  line beginning with '<LEND>'.  Lines outside of entries are skipped.
+  Each Entry gets the lines of the entry and the 1-based line numbers
+  of its first and last line.
+  Prints an error message and exits with status 1 when
+   - '<L>' occurs inside an open entry (Error 1)
+   - '<LEND>' occurs with no open entry (Error 2)
+   - the file ends inside an open entry (Error 3)
+  Prints the number of lines read and the number of entries found.
+ """
+ # slurp lines
+ with codecs.open(filein,encoding='utf-8',mode='r') as f:
+  lines = [line.rstrip('\r\n') for line in f]
+ recs=[]  # list of Entry objects
+ # idx1 = index of the '<L>' line of the open entry; None = no open entry
+ idx1 = None
+ for idx,line in enumerate(lines):
+  if line.startswith('<LEND>'):
+   if idx1 is None:
+    _init_error('init_entries Error 2. Not expecting <LEND>',idx,line)
+   entrylines = lines[idx1:idx+1]
+   recs.append(Entry(entrylines,idx1+1,idx+1))
+   # prepare for next entry
+   idx1 = None
+  elif line.startswith('<L>'):
+   if idx1 is not None:
+    _init_error('init_entries Error 1. Not expecting <L>',idx,line)
+   idx1 = idx
+  # any other line: part of the open entry, or skipped between entries
+ # when all lines are read, there should be no open entry
+ if idx1 is not None:
+  print('digentry.init Error 3. for file',filein)
+  print('Last entry not closed. Open entry starts at line',idx1+1)
+  sys.exit(1)
+
+ print(len(lines),"lines read from",filein)
+ print(len(recs),"entries found")
+ return recs
+
+def parseheadline(headline):
+ """
+  Turn a 'metaline' into a dictionary of its <key>value fields.
+  Leading and trailing whitespace of headline is ignored.
+  When a key occurs more than once, the last value is kept.
+  Example:
+  headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
+  returns dictionary
+  {'L': '16850',
+   'pc': '292-3',
+   'k1': 'visarga',
+   'k2': 'visarga',
+   'h': '1',
+   'e': ''}
+ """
+ # each match is a (key, value) pair: text inside <...>, then text up to next <
+ pairs = re.findall('[<]([^>]*)[>]([^<]*)',headline.strip())
+ return dict(pairs)
+
+if __name__=="__main__":
+ # usage: python digentry.py xxx.txt
+ #  xxx.txt = path to digitization of xxx
+ entries = init(sys.argv[1])
diff --git a/v02/pw/althws/make_hwextra.py b/v02/pw/althws/make_hwextra.py
@@ -0,0 +1,32 @@
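+#-*- coding:utf-8 -*-
+""" make_hwextra.py
+ Copy the lines of the input file that do not start with ';'
+ (the non-comment lines) to the output file.
+ Usage: python make_hwextra.py <filein> <fileout>
+"""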
+from __future__ import print_function
+import sys,re,codecs
+
+def read_lines(filein):
+ """Return the lines of the utf-8 file filein as a list of strings,
+  with trailing carriage-return / newline characters removed.
+ """
+ result = []
+ with codecs.open(filein,encoding='utf-8',mode='r') as f:
+  for raw in f:
+   result.append(raw.rstrip('\r\n'))
+ return result
+
+def extract(lines):
+ """Return a new list with the non-comment lines of lines, in order.
+  A comment line is one that starts with ';'.
+ """
+ kept = []
+ for text in lines:
+  if text.startswith(';'):
+   # comment line: drop it
+   continue
+  kept.append(text)
+ return kept
+
+def write(fileout,lines):
+ """Write each string of lines, followed by a newline, to the utf-8
+  file fileout; then print the number of records written.
+ """
+ with codecs.open(fileout,"w","utf-8") as f:
+  for rec in lines:
+   text = rec+'\n'
+   f.write(text)
+ nrecs = len(lines)
+ print(nrecs,"records written to",fileout)
+
+if __name__=="__main__":
+ # usage: python make_hwextra.py <filein> <fileout>
+ filein = sys.argv[1] # multik2
+ fileout = sys.argv[2] # multik2a
+ # read, drop the comment lines, write
+ write(fileout,extract(read_lines(filein)))
+
+
+
diff --git a/v02/pw/althws/multik2.py b/v02/pw/althws/multik2.py
@@ -0,0 +1,91 @@
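+#-*- coding:utf-8 -*-
+"""multik2.py
+ Read a digitization with digentry, and write one tab-delimited record
+ (L, L of next entry, pc, k1, k2) for each entry whose k2 contains a comma.
+ Usage: python multik2.py <filein> <fileout>
+"""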
+from __future__ import print_function
+import sys,re,codecs
+import digentry  
+
+def generate_Ls(L0,L1,n):
+ """
+  A difficult problem in general.
+  Assume L0 and L1 are digit sequences.
+   (or, L1 may be None, and is then taken as int(L0)+1)
+  Return a list of n strings, the new L values that follow L0:
+   n < 10       : L0.1, L0.2, ...    (one decimal place)
+   10 <= n < 100: L0.01, L0.02, ...  (two decimal places)
+  L1 is used only to check that L0 < L1.
+  Raises AssertionError when L0 >= L1, or when n is not in 1..99.
+ """
+ n0 = int(L0)
+ n1 = (n0 + 1) if (L1 is None) else int(L1)
+ assert (n0 < n1),"generate_Ls ERROR 1: L0=%s, L1=%s" % (L0,L1)
+ assert (1<=n) and (n<100),"generate_Ls ERROR 2: n=%s, L0=%s, L1=%s" % (n,L0,L1)
+ # construct intermediate values
+ if (n<10):
+  divisor,fmt = 10.0,'%0.1f'
+ else: # 10<=n<=99
+  divisor,fmt = 100.0,'%0.2f'
+ # compute each value directly from k, so that rounding errors
+ # of repeated float additions cannot accumulate
+ return [fmt % (n0 + k/divisor) for k in range(1,n+1)]
+
+
+def multik2s(entries):
+ """ construct tab-delimited records needed for construction of hwextra
+  entries: list of objects with attributes
+   metaline (string) and metad (dictionary with keys 'L','pc','k1','k2').
+  One record is made for each entry whose k2 contains a comma:
+   L <tab> L of next entry <tab> pc <tab> k1 <tab> k2
+  The 'L of next entry' field is empty for the last entry.
+  Prints a WARNING when metaline differs from the line rebuilt from
+  L, pc, k1, k2; the record is still made.
+  Prints the number of records, and returns the list of records.
+ """
+ althws = []
+ nentries = len(entries)
+ for ientry,entry in enumerate(entries):
+  metaline = entry.metaline
+  L0 = entry.metad['L']
+  pc0 = entry.metad['pc']
+  k10 = entry.metad['k1']
+  k20 = entry.metad['k2']
+  if not (',' in k20):
+   # no alternates for this entry
+   continue
+  # only L0,pc0,k10,k20 should be in metaline
+  metacalc = '<L>%s<pc>%s<k1>%s<k2>%s' %(L0,pc0,k10,k20)
+  if metaline != metacalc:
+   print('WARNING: metaline=%s' % metaline)
+   print('         metacalc=%s' % metacalc)
+
+  # Get L for next entry, or empty string if this is last entry.
+  # (None can not be used here: '\t'.join requires strings)
+  ientry1 = ientry+1
+  if ientry1 < nentries:
+   L1 = entries[ientry1].metad['L']
+  else:
+   L1 = ''
+  vals = (L0,L1,pc0,k10,k20)
+  althws.append('\t'.join(vals))
+ print(len(althws),"metalines with comma")
+ return althws
+
+def write(fileout,lines):
+ """Write the strings of lines to the utf-8 file fileout, one per line,
+  and print how many records were written.
+ """
+ with codecs.open(fileout,"w","utf-8") as f:
+  for rec in lines:
+   f.write(rec+'\n')
+ count = len(lines)
+ print(count,"records written to",fileout)
+
+if __name__=="__main__":
+ # usage: python multik2.py <filein> <fileout>
+ filein = sys.argv[1] # pwkvn.txt
+ fileout = sys.argv[2] # pwhvn_hwextra.txt
+
+ # entries of the digitization -> records for entries with comma in k2
+ write(fileout,multik2s(digentry.init(filein)))
+
+
+