-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
eb0be63
commit ec71116
Showing
51 changed files
with
349,411 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
; Manual changes to temp_md_0b.txt | ||
---+ | ||
<L>1146<pc>011-2<k1>aDvan | ||
{#aDvan#}¦ ádhvan, <lex>m.</lex> road; journey, wandering; 🞄distance; {@-ina,@} <lex>m.</lex> traveller. | ||
-Ina (iast) | ||
---+ | ||
<L>1667<pc>016-1<k1> | ||
{@-dhyāying,@} -> {@-dhyāyin,@} | ||
---+ | ||
<L>2072<pc>021-3<k1>apratiraTa<k2>apratiraTa<e>S | ||
{@-rāpa,@} -> {@-rūpa,@} | ||
rUpa | ||
---+ | ||
<L>2076<pc>021-3<k1>apratizWa | ||
{@-ṣaṭhita,@} -> {@-ṣṭhita,@} | ||
---+ | ||
<L>2211<pc>023-1<k1>aBitAqana | ||
{@-tāps,@} -> {@-tāpa,@} | ||
{@-tāma,@} -> {@-tāmra,@} | ||
tāpa | ||
---+ | ||
<L>2377<pc>025-1<k1>amantraka | ||
{@-gñ@}a, vid, -> {@-jña,@} {@-vid,@} | ||
---+ | ||
<L>3167<pc>034-3<k1>asaScat | ||
{@-anti@} -> {@-antī@} | ||
NOTE: {@-antī@} and {@-át-ī@} are not cpds, but feminine forms | ||
---+ | ||
<L>3564<pc>039-1<k1>Adfta | ||
{@-vati,@} -> {@-vatī,@} | ||
---+ | ||
<L>3870<pc>042-2<k1>AlIQa<k2>AlIQa<e>S | ||
{@-viśeṣa-śbhin,@} -> {@-viśeṣa-śobhin,@} | ||
---+ | ||
<L>3921<pc>043-1<k1>AveSa | ||
{@-īka,@} -> {@-ika,@} | ||
---+ | ||
<L>5901<pc>068-2<k1>kiMnara | ||
{@-nāmaka (ikā),@} -> {@-nāmaka,@} {@(ikā),@} | ||
---+ | ||
<L>5463<pc>062-2<k1>kapAwa | ||
{@-ka (ikā@}) -> {@-ka,@} ({@ikā@}) | ||
---+ | ||
<L>5458<pc>062-2<k1>kanyA | ||
{@-āāra,@} -> {@-āgāra,@} | ||
---+ | ||
<L>247<pc>003-1<k1>aguRajYa | ||
{@-śila,@} -> {@-śīla,@} | ||
---+ | ||
<L>272<pc>003-1<k1>agnigfha | ||
{@-jihava,@} -> {@-jihvá,@} | ||
---+ | ||
<L>279<pc>003-1<k1>agnimuKa | ||
{@-saraṇa,@} -> {@-śaraṇa,@} | ||
---+ | ||
<L>282<pc>003-2<k1>agnizwut | ||
{@-ṣvāttā,@} -> {@-ṣvāttá,@} | ||
---+ | ||
<L>296<pc>003-2<k1>agnyagAra | ||
{@-adhéya,@} -> {@-ādhéya,@} | ||
---+ | ||
<L>302<pc>003-2<k1>agranaKa | ||
{@-vīva,@} -> {@-vīra,@} | ||
---+ | ||
<L>324<pc>003-3<k1>agryatapas | ||
{@-mahiṣi,@} -> {@-mahiṣī,@} | ||
---+ | ||
<L>372<pc>004-1<k1>aNgabanDana | ||
{@-bhu,@} -> {@-bhū,@} | ||
---+ | ||
<L>298<pc>003-2<k1>agrakara | ||
{@-gra,@} -> {@-ga,@} | ||
|
||
---+ | ||
<L>1008<pc>010-1<k1>aDika | ||
and mer; exceeded -> and more; exceeded | ||
---+ | ||
<L>512<pc>005-2<k1>ajYAta | ||
{@-kulaśila,@} -> {@-kulaśīla,@} | ||
---+ | ||
<L>697<pc>007-1<k1>atibala | ||
{@-baliyas,@} -> {@-balīyas,@} | ||
---+ | ||
<L>728<pc>007-2<k1>atilubDa | ||
{@-tā-,@} -> {@-tā,@} | ||
---+ | ||
<L>744<pc>007-2<k1>ativega | ||
{@-vepatha-mat,@} -> {@-vepathu-mat,@} | ||
---+ | ||
<L>844<pc>008-2<k1>atyAditya | ||
{@-āūḍha,@} -> {@-ārūḍha,@} | ||
{@-āūḍhi,@} -> {@-ārūḍhi,@} | ||
---+ | ||
<L>865<pc>008-3<k1>atraBavat | ||
{@-i,@} -> {@-ī,@} | ||
---+ | ||
<L>983<pc>009-3<k1>aDarAt | ||
{#aDarAt#}¦ adharā́t, <lex>ad.</lex> below; {@-āt,@} | ||
{@-āt,@} -> {@-tāt,@} (print-change cf. MW) | ||
---+ | ||
<L>1106<pc>010-3<k1>aDogata | ||
{@-nivita,@} -> {@-nivīta,@} | ||
---+ | ||
<L>1229<pc>012-1<k1>ananta | ||
{@-kirti,@} -> {@-kīrti,@} | ||
---+ | ||
<L>1458<pc>014-1<k1>anAyasita | ||
to plying -> not plying | ||
---+ | ||
<L>1463<pc>014-1<k1>anAraBya<k2>anAraBya<e>S | ||
{#anAraBya#}¦ an-ā-rabhya, <ab>fp.</ab> not to be begun; 🞄impossible; {@-ārambh-a,@} <lex>m.</lex> non-commencement 🞄(of, <ab>g.</ab>); <lex>a.</lex> ruddha, <ab>pp.</ab> unlimited. | ||
OLD: missing entry | ||
<L>1463<pc>014-1<k1>anAraBya<k2>anAraBya<e>S | ||
{#anAraBya#}¦ an-ā-rabhya, <ab>fp.</ab> not to be begun; 🞄impossible; {@-ārambh-a,@} <lex>m.</lex> non-commencement 🞄(of, <ab>g.</ab>); <lex>a.</lex> unenterprising: {@-in@,} <lex>a.</lex> <ab>id.</ab> | ||
<LEND> | ||
NEW: | ||
<L>1463.1<pc>014-1<k1>anArudDa<k2>anArudDa<e>S | ||
{#anArudDa#}¦ an-ā-ruddha, <ab>pp.</ab> unlimited. | ||
<LEND> | ||
---+ | ||
<L>1636<pc>015-2<k1>anukta | ||
{@-kliva-vacana,@} -> {@-klīva-vacana,@} | ||
---+ | ||
<L>1693<pc>016-2<k1>anupeta | ||
{@-púrva,@} -> {@-pūrva,@} | ||
---+ | ||
<L>1711<pc>017-1<k1>anuSaya | ||
{@-śāanīya,@} -> {@-śāsanīya,@} | ||
{@-śphin,@} -> {@-śobhin,@} | ||
---+ | ||
<L>1874<pc>019-2<k1>apacita | ||
{@-citti,@} -> {@-citi,@} | ||
---+ | ||
<L>1883<pc>019-2<k1>apatita | ||
{@-anyo'nya-tyāgin,@} -> {@-anyonya-tyāgin,@} PRINT CHANGE | ||
---+ | ||
<L>1887<pc>019-2<k1>apatya | ||
{@-sneka-kṛpā-maya,@} -> {@-sneha-kṛpā-maya,@} | ||
---+ | ||
<L>1896<pc>019-2<k1>apadravya | ||
{@-dhavaṃsá,@} -> {@-dhvaṃsá,@} | ||
---+ | ||
<L>1897<pc>019-2<k1>apanaya | ||
{@-ninīṣum@} -> {@-ninīṣu@} | ||
---+ | ||
<L>1918<pc>019-3<k1>aparAdDa | ||
{@-rādhṛ,@} -> {@-rāddhṛ,@} | ||
---+ | ||
<L>1919<pc>019-3<k1>aparADa | ||
{@(i)-tā,@} -> ({@i@}){@-tā,@} | ||
---+ | ||
<L>1961<pc>020-2<k1>apasnAna | ||
{@-spasśa,@} -> {@-spaśa,@} | ||
---+ | ||
<L>1974<pc>020-3<k1>apANga | ||
{@ā, i@} -> {@ā,@} {@ī@} | ||
---+ | ||
<L>2200<pc>023-1<k1>aBicakzaRa | ||
{@-cákaṣe,@} -> {@-cákṣe,@} | ||
---+ | ||
<L>2226<pc>023-1<k1>aBiniveSa | ||
promeness -> proneness | ||
---+ | ||
<L>2228<pc>023-2<k1>aBinna | ||
{@-vala,@} -> {@-vela,@} | ||
---+ | ||
<L>2274<pc>023-3<k1>aBizikta | ||
{@-sheṇana,@} -> {@-ṣeṇana,@} | ||
---+ | ||
<L>2282<pc>024-1<k1>aBisara | ||
(n){@-ī,@} -> (ṇ){@-ī,@} | ||
---+ | ||
<L>2326<pc>024-2<k1>aByarTana | ||
{@-arthaṇa,@} -> {@-arhaṇa,@} | ||
---+ | ||
<L>2381<pc>025-1<k1>amara | ||
{@-pati-kamāra,@} -> {@-pati-kumāra,@} | ||
{@-prāthita,@} -> {@-prārthita,@} | ||
---+ | ||
<L>2472<pc>026-1<k1>amBoja | ||
{@-jnī,@} -> {@-jinī,@} | ||
---+ | ||
<L>2696<pc>028-3<k1>alaMkAra | ||
{@-śila,@} -> {@-śīla,@} | ||
---+ | ||
<L>2745<pc>029-1<k1>alpaparIvAra | ||
{@-paṇya,@} -> {@-puṇya,@} | ||
---+ | ||
<L>2769<pc>029-3<k1>avacaya | ||
{@-ciciṣā,@} -> {@-cicīṣā,@} | ||
---+ | ||
<L>2795<pc>029-3<k1>avaDya | ||
{@-bhāna,@} -> {@-bhāva,@} | ||
---+ | ||
<L>2909<pc>031-2<k1>aviparyaya | ||
{@-ās@}a, -> {@-āsa,@} | ||
---+ | ||
<L>3068<pc>033-1<k1>aSru | ||
{@k@}ṛ -> {@kṛ@} | ||
{@mu@}c, -> {@muc,@} | ||
---+ | ||
<L>3111<pc>033-3<k1>asaMyama | ||
{@-vijñāna,@} .. unconscious -> {@-vijñāta,@} | ||
{@- lakṣita,@} -> {@-lakṣita,@} | ||
---+ | ||
<L>3144<pc>034-2<k1>asamaYja | ||
{#°sa#} -sa, -> {#°sa#} {@-sa,@} | ||
---+ | ||
<L>3160<pc>034-2<k1>asaMBava | ||
{@-bhāya,@} -> {@-bhāvya,@} | ||
---+ | ||
<L>3167<pc>034-3<k1>asaScat | ||
{@-anti@} -> {@-antī@} | ||
---+ | ||
<L>3325<pc>036-2<k1>ahar | ||
{@-pāti,@} -> {@-páti,@} | ||
---+ | ||
<L>3456<pc>037-3<k1>AcArya | ||
{@-adhina,@} -> {@-adhīna,@} | ||
---+ | ||
<L>3079<pc>033-1<k1>aSvatTa | ||
{@- pādāta-sārameya-maya,@} -> {@-pādāta-sārameya-maya,@} | ||
--- | ||
<L>6245<pc>073-2<k1>kfmi | ||
*{@-kośaja, *-kośa‿uttha,@} -> *{@-kośaja,@} *{@-kośa‿uttha,@} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
#-*- coding:utf-8 -*- | ||
"""digentry.py | ||
Module to read a digitization | ||
and generate a list of Entry objects | ||
Adapted for temp_pwkvn_22.txt | ||
""" | ||
from __future__ import print_function | ||
import sys,re,codecs | ||
|
||
class Entry(object):
    """One entry of a digitization: a metaline, data lines and an end line.

    Attributes set by __init__:
      metaline  -- first line of the entry (the '<L>...' headline)
      lend      -- last line of the entry (the '<LEND>' line)
      datalines -- the lines strictly between metaline and lend
      metad     -- dictionary returned by parseheadline(metaline)
      linenum1  -- int, as supplied by the caller
      linenum2  -- int, as supplied by the caller
      lsarr     -- list, initially empty
    """
    # Registry of every Entry constructed so far, keyed by the 'L' value of
    # its metaline.  It is a class attribute, so it is shared by all
    # instances; __init__ uses it to detect duplicate L values.
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        """Build an Entry from `lines` (metaline first, <LEND> line last).

        linenum1 and linenum2 are ints and are stored unchanged.
        If the 'L' value of the metaline was already used by an earlier
        Entry, an error message is printed and the program exits with
        status 1.
        """
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit rather than the bare exit() helper: exit() is added
            # by the site module for interactive use and is not guaranteed
            # to exist in every interpreter configuration.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []
|
||
def init(filein):
    """Read the digitization file `filein` and return a list of Entry objects.

    The file is read as UTF-8.  An entry starts at a line beginning with
    '<L>' and ends at the next line beginning with '<LEND>'; both lines are
    part of the entry.  Lines outside any entry are ignored.  Each Entry is
    constructed with its lines and the 1-based line numbers of its first
    and last line.

    Prints the number of lines read and the number of entries found.

    On a structural error (an '<L>' line inside an open entry, an '<LEND>'
    line outside any entry, or an entry still open at end of file) a
    diagnostic is printed and the program exits with status 1.
    """
    # slurp lines, removing only the line terminators
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects, in file order
    # 0-based index of the '<L>' line of the currently open entry,
    # or None when we are between entries.
    idx1 = None
    for idx, line in enumerate(lines):
        if idx1 is None:
            # Not in an entry: looking for '<L>'
            if line.startswith('<L>'):
                idx1 = idx
            elif line.startswith('<LEND>'):  # error
                print('init_entries Error 2. Not expecting <LEND>')
                print("line # ", idx+1)
                print(line.encode('utf-8'))
                sys.exit(1)
            continue
        # In an entry: looking for '<LEND>'
        if line.startswith('<LEND>'):
            entry = Entry(lines[idx1:idx+1], idx1 + 1, idx + 1)
            recs.append(entry)
            # prepare for next entry
            idx1 = None
        elif line.startswith('<L>'):  # error
            print('init_entries Error 1. Not expecting <L>')
            print("line # ", idx+1)
            print(line.encode('utf-8'))
            sys.exit(1)
    # when all lines are read, no entry should still be open
    if idx1 is not None:
        print('digentry.init Error 3. for file', filein)
        print('Last entry not closed. Open entry starts at line', idx1+1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs
|
||
def parseheadline(headline):
    """Parse a 'metaline' and return a dictionary of its fields.

    The metaline is a sequence of '<key>value' pairs.  Leading and trailing
    whitespace of `headline` is ignored.  A value is the text up to the next
    '<' and may be empty.  If a key occurs more than once, the last
    occurrence wins.

    Example:
    headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
    returns dictionary
    {'L': '16850',
     'pc': '292-3',
     'k1': 'visarga',
     'k2': 'visarga',
     'h': '1',
     'e': ''}
    """
    # With two capture groups, findall yields one (key, value) tuple per
    # match, which dict() consumes directly.
    pairs = re.findall(r'<([^>]*)>([^<]*)', headline.strip())
    return dict(pairs)
|
||
if __name__ == "__main__":
    # Script entry point.  The single command-line argument is xxx.txt,
    # the path to the digitization of xxx.
    filein = sys.argv[1]
    entries = init(filein)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#-*- coding:utf-8 -*- | ||
"""hw_check.py | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
|
||
def read_lines(filein):
    """Return the lines of the UTF-8 text file `filein` as a list.

    Trailing carriage-return and line-feed characters are removed from
    each line; all other characters are kept.  Also prints how many lines
    were read.
    """
    lines = []
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        for raw in f:
            lines.append(raw.rstrip('\r\n'))
    print(len(lines), "lines read from", filein)
    return lines
|
||
def write_lines(fileout, lines):
    """Write the strings in `lines` to `fileout`, encoded as UTF-8.

    A '\\n' is appended to every item.  An existing file is overwritten.
    Also prints how many lines were written.
    """
    with codecs.open(fileout, "w", "utf-8") as out:
        for text in lines:
            terminated = text + '\n'
            out.write(terminated)
    print(len(lines), "lines written to", fileout)
|
||
def init_hws(filein):
    """Return a dictionary whose keys are the distinct k1 values in `filein`.

    Every line of `filein` must start with the fields
    '<L>...<pc>...<k1>...<k2>', for example
      <L>1<pc>1,1<k1>a<k2>a<h>1<e>1<ln1>5<ln2>7
    The text between '<k1>' and '<k2>' becomes a key of the result; every
    value is True, so the dictionary serves as a membership table.

    Also prints the number of distinct keys.

    Raises ValueError, naming the file and 1-based line number, when a
    line does not have the expected leading fields.
    """
    lines = read_lines(filein)
    # Compile once rather than looking the pattern up for every line.
    pattern = re.compile(r'^<L>(.*?)<pc>(.*?)<k1>(.*?)<k2>')
    d = {}  # unique k1 values
    for linenum, line in enumerate(lines, 1):
        m = pattern.search(line)
        if m is None:
            # Without this check the failure would surface as an
            # AttributeError on None with no hint of where the bad line is.
            raise ValueError(
                "init_hws: line %s of %s is not a headword line: %r"
                % (linenum, filein, line))
        k1 = m.group(3)
        d[k1] = True
    print("%s keys from %s" % (len(d), filein))
    return d
|
||
if __name__ == "__main__":
    # Script entry point.  Three command-line arguments, in this order:
    filein = sys.argv[1]   # list of words (slp1 Sanskrit), one per line
    filein1 = sys.argv[2]  # headwords in xxxhw.txt format
    fileout = sys.argv[3]  # list of words found

    words = read_lines(filein)
    hwd = init_hws(filein1)  # dictionary of k1s from xxxhw.txt

    # Keep, in input order, the words that occur as a k1 headword.
    found = [word for word in words if word in hwd]

    write_lines(fileout, found)
Oops, something went wrong.