Skip to content

Commit

Permalink
extended ascii regenerated, and put into md-meta2 in csl-orig. #11
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Dec 26, 2023
1 parent e63d8a6 commit 7b57765
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 4 deletions.
65 changes: 65 additions & 0 deletions mdissues/issue11/abv1/ea.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
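#-*- coding:utf-8 -*-
"""ea.py: count the non-ASCII ("extended ascii") characters of a digitization.
Usage: python ea.py <filein> <fileout>
Derived from check_ea.py for ap57,ap90.
"""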
import sys,re,codecs
import unicodedata
## Ref: https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
## Forcing stdout to UTF-8 is required under git bash to avoid the error
##   UnicodeEncodeError: 'charmap' codec cannot encode characters
## when this program is run from a git bash script.

sys.stdout.reconfigure(encoding='utf-8')
def check_ea(lines):
    """Count the non-ASCII ("extended ascii") characters in the body lines.

    A character is counted when ord(c) > 127.  The following lines are
    skipped and contribute nothing to the counts:
      * the first line (index 0), unconditionally -- the original comment
        cites a file-header line such as
        '%***This File is E:\\APTE.ALL, Last update 11.09.06' as the reason;
      * lines that are empty after removing trailing CR/LF;
      * lines starting with '<L>' (entry metalines);
      * lines equal to '<LEND>';
      * lines starting with '[Page'.

    Parameters:
      lines: iterable of strings, with or without trailing line endings.

    Returns:
      dict mapping each counted character to its number of occurrences.

    Side effect: prints the number of distinct counted characters.
    """
    counts = {}
    for iline, line in enumerate(lines):
        if iline == 0:
            # first line is never examined (see docstring)
            continue
        line = line.rstrip('\r\n')
        if line == '':
            continue
        # markup lines are not part of the text being surveyed
        if line.startswith(('<L>', '[Page')) or line == '<LEND>':
            continue
        for c in line:
            if ord(c) > 127:
                counts[c] = counts.get(c, 0) + 1

    print(len(counts), "extended ascii characters")
    return counts

def write_ea(fileout,eadict):
    """Write one report line per character of eadict to fileout (UTF-8).

    Lines are ordered by character (code point order) and have the form
        <char> (\\u<hex code point>) <count, width 5> := <Unicode name>
    The hex code point is zero-padded to at least 4 digits; code points
    above U+FFFF are therefore shown with 5 or more digits.

    Parameters:
      fileout: path of the output file (overwritten).
      eadict:  dict mapping single characters to integer counts,
               as returned by check_ea.

    Side effect: prints the number of lines written and the file name.
    """
    keys = sorted(eadict)

    with codecs.open(fileout,"w","utf-8") as f:
        for key in keys:
            # Some code points > 127 have no Unicode name (e.g. C1 control
            # characters, private-use characters); unicodedata.name raises
            # ValueError for them unless a default is supplied.
            name = unicodedata.name(key, "<unnamed>")
            out = "%s (\\u%04x) %5d := %s" %(key,ord(key),eadict[key],name)
            f.write(out+'\n')
    print(len(keys),"extended ascii counts written to",fileout)

if __name__=="__main__":
    # Command line:  python ea.py <filein> <fileout>
    #   filein : xxx.txt (path to digitization of xxx)
    #   fileout: report of extended ascii characters and their counts
    if len(sys.argv) < 3:
        # exit with a usage message instead of a bare IndexError
        sys.exit("Usage: python ea.py <filein> <fileout>")
    filein = sys.argv[1]
    fileout = sys.argv[2]

    with codecs.open(filein,"r","utf-8") as f:
        # check_ea removes trailing CR/LF from each line itself
        lines = list(f)
    eacounts = check_ea(lines)
    write_ea(fileout,eacounts)

107 changes: 107 additions & 0 deletions mdissues/issue11/abv1/ea_romana.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
¤ (\u00a4) 404 := CURRENCY SIGN
¦ (\u00a6) 20749 := BROKEN BAR
° (\u00b0) 4817 := DEGREE SIGN
± (\u00b1) 218 := PLUS-MINUS SIGN
² (\u00b2) 25 := SUPERSCRIPT TWO
¹ (\u00b9) 512 := SUPERSCRIPT ONE
Ñ (\u00d1) 31 := LATIN CAPITAL LETTER N WITH TILDE
× (\u00d7) 15 := MULTIPLICATION SIGN
à (\u00e0) 6 := LATIN SMALL LETTER A WITH GRAVE
á (\u00e1) 5573 := LATIN SMALL LETTER A WITH ACUTE
â (\u00e2) 112 := LATIN SMALL LETTER A WITH CIRCUMFLEX
é (\u00e9) 294 := LATIN SMALL LETTER E WITH ACUTE
ê (\u00ea) 2 := LATIN SMALL LETTER E WITH CIRCUMFLEX
í (\u00ed) 728 := LATIN SMALL LETTER I WITH ACUTE
ñ (\u00f1) 960 := LATIN SMALL LETTER N WITH TILDE
ó (\u00f3) 225 := LATIN SMALL LETTER O WITH ACUTE
ù (\u00f9) 3 := LATIN SMALL LETTER U WITH GRAVE
ú (\u00fa) 682 := LATIN SMALL LETTER U WITH ACUTE
û (\u00fb) 1 := LATIN SMALL LETTER U WITH CIRCUMFLEX
ü (\u00fc) 2 := LATIN SMALL LETTER U WITH DIAERESIS
Ā (\u0100) 1389 := LATIN CAPITAL LETTER A WITH MACRON
ā (\u0101) 25806 := LATIN SMALL LETTER A WITH MACRON
ă (\u0103) 2 := LATIN SMALL LETTER A WITH BREVE
Ī (\u012a) 60 := LATIN CAPITAL LETTER I WITH MACRON
ī (\u012b) 7216 := LATIN SMALL LETTER I WITH MACRON
ĭ (\u012d) 6 := LATIN SMALL LETTER I WITH BREVE
ł (\u0142) 6 := LATIN SMALL LETTER L WITH STROKE
œ (\u0153) 4 := LATIN SMALL LIGATURE OE
Ś (\u015a) 828 := LATIN CAPITAL LETTER S WITH ACUTE
ś (\u015b) 6417 := LATIN SMALL LETTER S WITH ACUTE
Ū (\u016a) 53 := LATIN CAPITAL LETTER U WITH MACRON
ū (\u016b) 2799 := LATIN SMALL LETTER U WITH MACRON
ŭ (\u016d) 2 := LATIN SMALL LETTER U WITH BREVE
ɴ (\u0274) 403 := LATIN LETTER SMALL CAPITAL N
ʼ (\u02bc) 1743 := MODIFIER LETTER APOSTROPHE
́ (\u0301) 1825 := COMBINING ACUTE ACCENT
̂ (\u0302) 11 := COMBINING CIRCUMFLEX ACCENT
̄ (\u0304) 13 := COMBINING MACRON
̆ (\u0306) 67 := COMBINING BREVE
̐ (\u0310) 1 := COMBINING CANDRABINDU
̣ (\u0323) 1 := COMBINING DOT BELOW
ά (\u03ac) 3 := GREEK SMALL LETTER ALPHA WITH TONOS
ή (\u03ae) 3 := GREEK SMALL LETTER ETA WITH TONOS
ί (\u03af) 1 := GREEK SMALL LETTER IOTA WITH TONOS
α (\u03b1) 3 := GREEK SMALL LETTER ALPHA
δ (\u03b4) 3 := GREEK SMALL LETTER DELTA
ε (\u03b5) 5 := GREEK SMALL LETTER EPSILON
η (\u03b7) 1 := GREEK SMALL LETTER ETA
ι (\u03b9) 1 := GREEK SMALL LETTER IOTA
μ (\u03bc) 3 := GREEK SMALL LETTER MU
ν (\u03bd) 5 := GREEK SMALL LETTER NU
ο (\u03bf) 1 := GREEK SMALL LETTER OMICRON
π (\u03c0) 2 := GREEK SMALL LETTER PI
ρ (\u03c1) 5 := GREEK SMALL LETTER RHO
ς (\u03c2) 1 := GREEK SMALL LETTER FINAL SIGMA
τ (\u03c4) 4 := GREEK SMALL LETTER TAU
χ (\u03c7) 1 := GREEK SMALL LETTER CHI
ό (\u03cc) 1 := GREEK SMALL LETTER OMICRON WITH TONOS
ϛ (\u03db) 2 := GREEK SMALL LETTER STIGMA
Ḍ (\u1e0c) 31 := LATIN CAPITAL LETTER D WITH DOT BELOW
ḍ (\u1e0d) 1258 := LATIN SMALL LETTER D WITH DOT BELOW
ḥ (\u1e25) 477 := LATIN SMALL LETTER H WITH DOT BELOW
Ḷ (\u1e36) 2 := LATIN CAPITAL LETTER L WITH DOT BELOW
ḷ (\u1e37) 34 := LATIN SMALL LETTER L WITH DOT BELOW
Ṃ (\u1e42) 22 := LATIN CAPITAL LETTER M WITH DOT BELOW
ṃ (\u1e43) 2363 := LATIN SMALL LETTER M WITH DOT BELOW
Ṅ (\u1e44) 12 := LATIN CAPITAL LETTER N WITH DOT ABOVE
ṅ (\u1e45) 1081 := LATIN SMALL LETTER N WITH DOT ABOVE
Ṇ (\u1e46) 24 := LATIN CAPITAL LETTER N WITH DOT BELOW
ṇ (\u1e47) 4869 := LATIN SMALL LETTER N WITH DOT BELOW
Ṛ (\u1e5a) 220 := LATIN CAPITAL LETTER R WITH DOT BELOW
ṛ (\u1e5b) 4760 := LATIN SMALL LETTER R WITH DOT BELOW
Ṝ (\u1e5c) 11 := LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
ṝ (\u1e5d) 33 := LATIN SMALL LETTER R WITH DOT BELOW AND MACRON
Ṣ (\u1e62) 123 := LATIN CAPITAL LETTER S WITH DOT BELOW
ṣ (\u1e63) 6536 := LATIN SMALL LETTER S WITH DOT BELOW
Ṭ (\u1e6c) 44 := LATIN CAPITAL LETTER T WITH DOT BELOW
ṭ (\u1e6d) 2514 := LATIN SMALL LETTER T WITH DOT BELOW
ἀ (\u1f00) 2 := GREEK SMALL LETTER ALPHA WITH PSILI
ὖ (\u1f56) 1 := GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
ὥ (\u1f65) 1 := GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA
‒ (\u2012) 10 := FIGURE DASH
— (\u2014) 5872 := EM DASH
‘ (\u2018) 112 := LEFT SINGLE QUOTATION MARK
’ (\u2019) 112 := RIGHT SINGLE QUOTATION MARK
‿ (\u203f) 2852 := UNDERTIE
⁄ (\u2044) 12 := FRACTION SLASH
₀ (\u2080) 1 := SUBSCRIPT ZERO
₂ (\u2082) 7 := SUBSCRIPT TWO
₄ (\u2084) 3 := SUBSCRIPT FOUR
₆ (\u2086) 1 := SUBSCRIPT SIX
₈ (\u2088) 2 := SUBSCRIPT EIGHT
Ⅰ (\u2160) 469 := ROMAN NUMERAL ONE
Ⅱ (\u2161) 98 := ROMAN NUMERAL TWO
Ⅲ (\u2162) 38 := ROMAN NUMERAL THREE
Ⅳ (\u2163) 134 := ROMAN NUMERAL FOUR
Ⅴ (\u2164) 33 := ROMAN NUMERAL FIVE
Ⅵ (\u2165) 110 := ROMAN NUMERAL SIX
Ⅶ (\u2166) 22 := ROMAN NUMERAL SEVEN
Ⅷ (\u2167) 7 := ROMAN NUMERAL EIGHT
Ⅸ (\u2168) 52 := ROMAN NUMERAL NINE
Ⅹ (\u2169) 30 := ROMAN NUMERAL TEN
√ (\u221a) 1811 := SQUARE ROOT
⏑ (\u23d1) 7 := METRICAL BREVE
〈 (\u3008) 628 := LEFT ANGLE BRACKET
〉 (\u3009) 628 := RIGHT ANGLE BRACKET
🞄 (\u1f784) 60401 := BLACK SLIGHTLY SMALL CIRCLE
65 changes: 61 additions & 4 deletions mdissues/issue11/abv1/readme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,65 @@ git push
at Cologne, cd csl-apidev, then git pull.

----------------------------------------------------------
There is still an error with simple search in csl-apidev.
This will be resolved elsewhere.
**********************************************************
change to roman numerals in <cl>X</cl>
Ref: https://github.com/sanskrit-lexicon/MD/issues/11#issuecomment-1868537765
Start with temp_md_ab_v1_roman.txt from this link.

temp_md_ab_v1_romana.txt one additional change.
One change: in <L>19659<pc>358-2<k1>sfj
<ab>A.</ab> -> <lex>Ā.</lex>

Revise mdab_input.txt in csl-pywork
Ⅰ. <id>Ⅰ.</id> <disp>First conjugation.</disp>
Ⅱ. <id>Ⅱ.</id> <disp>Second conjugation.</disp>
Ⅲ. <id>Ⅲ.</id> <disp>Third conjugation.</disp>
Ⅳ. <id>Ⅳ.</id> <disp>Fourth conjugation.</disp>
Ⅴ. <id>Ⅴ.</id> <disp>Fifth conjugation</disp>
Ⅵ. <id>Ⅵ.</id> <disp>Sixth conjugation.</disp>
Ⅶ. <id>Ⅶ.</id> <disp>Seventh conjugation.</disp>
Ⅷ. <id>Ⅷ.</id> <disp>Eighth conjugation</disp>
Ⅸ. <id>Ⅸ.</id> <disp>Ninth conjugation.</disp>
Ⅹ. <id>Ⅹ.</id> <disp>Tenth conjugation</disp>

Install this version.
commit to repository csl-orig

cp ../temp_md_ab_v1_romana.txt /c/xampp/htdocs/cologne/csl-orig/v02/md/md.txt

error at dal.php? simple search problem.

-----------
cd /c/xampp/htdocs/cologne/csl-pywork/v02
sh generate_dict.sh md ../../md
sh xmlchk_xampp.sh md
# ok

-- csl-orig push to github
cd /c/xampp/htdocs/cologne/csl-orig/
git pull #
git add . # md.txt
git commit -m "MD: Use Unicode Roman numerals for 'cl' tag
Ref: https://github.com/sanskrit-lexicon/MD/issues/11#issuecomment-1868537765"

git push
# 1 file changed, 750 insertions(+), 750 deletions(-)

cd /c/xampp/htdocs/sanskrit-lexicon/MD/mdissues/issue11/abv1

----
Cologne install of csl-orig for new md, and regen displays.
----
---------------------------------------------------------
Regenerate list of extended ascii
python ea.py ../temp_md_ab_v1_romana.txt ea_romana.txt
# 107 extended ascii counts written to ea_romana.txt

revise csl-orig/v02/md/md-meta2.txt
- Put in the new extended ascii, and some other minor editing.

push this change to github.
pull csl-orig at cologne, and regenerate md in csl-pywork.

---------------------------------------------------------
cd /c/xampp/htdocs/sanskrit-lexicon/MD/mdissues/issue11/abv1
# push this repo to github
---------------------------------------------------------

0 comments on commit 7b57765

Please sign in to comment.