-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
extended ascii regenerated, and put into md-meta2 in csl-orig. #11
- Loading branch information
1 parent
e63d8a6
commit 7b57765
Showing
3 changed files
with
233 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#-*- coding:utf-8 -*- | ||
"""check_ea.py for ap57,ap90 | ||
""" | ||
import sys,re,codecs | ||
import unicodedata | ||
## https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters | ||
## This is required by git bash to avoid the error | ||
## UnicodeEncodeError: 'charmap' codec cannot encode characters | ||
## when run in a git bash script. | ||
|
||
# Force UTF-8 on stdout so printing non-ASCII text does not raise
# UnicodeEncodeError ('charmap' codec) in consoles such as git bash.
# Guarded because a replaced stdout (e.g. io.StringIO) has no reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
def check_ea(lines):
    """Count the non-ASCII characters found in the body lines of *lines*.

    The following lines are not examined:
      * the first line (index 0), which is a file header;
      * empty lines (after stripping trailing CR/LF);
      * metalines starting with '<L>' and the '<LEND>' terminator;
      * page markers starting with '[Page'.

    Every character with a code point above 127 in the remaining lines is
    tallied.  The number of distinct characters is printed to stdout.

    Args:
        lines: sequence of text lines.

    Returns:
        dict mapping each non-ASCII character to its occurrence count.
    """
    asdict = {}
    for iline, line in enumerate(lines):
        if iline == 0:
            # header line, e.g. '%***This File is E:\\APTE.ALL, Last update ...'
            continue
        line = line.rstrip('\r\n')
        if line == '':
            continue
        # structural markup lines carry no body text to be counted
        if line.startswith(('<L>', '[Page')) or line == '<LEND>':
            continue
        for c in line:
            if ord(c) > 127:
                asdict[c] = asdict.get(c, 0) + 1

    print(len(asdict), "extended ascii characters")
    return asdict
|
||
def write_ea(fileout, eadict):
    """Write one report line per character of *eadict* to *fileout* (UTF-8).

    Characters are written in sorted (code point) order.  Each line has the
    form

        <char> (\\u<hex code point>) <count> := <Unicode name>

    where the hex code point is zero-padded to at least four digits (code
    points above U+FFFF therefore print with more than four digits).  A
    summary line is printed to stdout when done.

    Args:
        fileout: path of the output file; it is overwritten.
        eadict: dict mapping single characters to integer counts.
    """
    keys = sorted(eadict)

    with codecs.open(fileout, "w", "utf-8") as f:
        for key in keys:
            # Some code points (e.g. C1 controls U+0080-U+009F) have no
            # Unicode name; supply a default instead of raising ValueError.
            name = unicodedata.name(key, "NO UNICODE NAME")
            out = "%s (\\u%04x) %5d := %s" % (key, ord(key), eadict[key], name)
            f.write(out + '\n')
    print(len(keys), "extended ascii counts written to", fileout)
|
||
if __name__ == "__main__":
    # Usage: check_ea.py <filein> <fileout>
    #   filein  : xxx.txt (path to digitization of xxx)
    #   fileout : extended ascii report
    filein, fileout = sys.argv[1], sys.argv[2]

    # Read the whole digitization, dropping trailing CR/LF from each line.
    lines = []
    with codecs.open(filein, "r", "utf-8") as f:
        for rawline in f:
            lines.append(rawline.rstrip('\r\n'))

    # Tally the non-ASCII characters, then write the report.
    eacounts = check_ea(lines)
    write_ea(fileout, eacounts)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
¤ (\u00a4) 404 := CURRENCY SIGN | ||
¦ (\u00a6) 20749 := BROKEN BAR | ||
° (\u00b0) 4817 := DEGREE SIGN | ||
± (\u00b1) 218 := PLUS-MINUS SIGN | ||
² (\u00b2) 25 := SUPERSCRIPT TWO | ||
¹ (\u00b9) 512 := SUPERSCRIPT ONE | ||
Ñ (\u00d1) 31 := LATIN CAPITAL LETTER N WITH TILDE | ||
× (\u00d7) 15 := MULTIPLICATION SIGN | ||
à (\u00e0) 6 := LATIN SMALL LETTER A WITH GRAVE | ||
á (\u00e1) 5573 := LATIN SMALL LETTER A WITH ACUTE | ||
â (\u00e2) 112 := LATIN SMALL LETTER A WITH CIRCUMFLEX | ||
é (\u00e9) 294 := LATIN SMALL LETTER E WITH ACUTE | ||
ê (\u00ea) 2 := LATIN SMALL LETTER E WITH CIRCUMFLEX | ||
í (\u00ed) 728 := LATIN SMALL LETTER I WITH ACUTE | ||
ñ (\u00f1) 960 := LATIN SMALL LETTER N WITH TILDE | ||
ó (\u00f3) 225 := LATIN SMALL LETTER O WITH ACUTE | ||
ù (\u00f9) 3 := LATIN SMALL LETTER U WITH GRAVE | ||
ú (\u00fa) 682 := LATIN SMALL LETTER U WITH ACUTE | ||
û (\u00fb) 1 := LATIN SMALL LETTER U WITH CIRCUMFLEX | ||
ü (\u00fc) 2 := LATIN SMALL LETTER U WITH DIAERESIS | ||
Ā (\u0100) 1389 := LATIN CAPITAL LETTER A WITH MACRON | ||
ā (\u0101) 25806 := LATIN SMALL LETTER A WITH MACRON | ||
ă (\u0103) 2 := LATIN SMALL LETTER A WITH BREVE | ||
Ī (\u012a) 60 := LATIN CAPITAL LETTER I WITH MACRON | ||
ī (\u012b) 7216 := LATIN SMALL LETTER I WITH MACRON | ||
ĭ (\u012d) 6 := LATIN SMALL LETTER I WITH BREVE | ||
ł (\u0142) 6 := LATIN SMALL LETTER L WITH STROKE | ||
œ (\u0153) 4 := LATIN SMALL LIGATURE OE | ||
Ś (\u015a) 828 := LATIN CAPITAL LETTER S WITH ACUTE | ||
ś (\u015b) 6417 := LATIN SMALL LETTER S WITH ACUTE | ||
Ū (\u016a) 53 := LATIN CAPITAL LETTER U WITH MACRON | ||
ū (\u016b) 2799 := LATIN SMALL LETTER U WITH MACRON | ||
ŭ (\u016d) 2 := LATIN SMALL LETTER U WITH BREVE | ||
ɴ (\u0274) 403 := LATIN LETTER SMALL CAPITAL N | ||
ʼ (\u02bc) 1743 := MODIFIER LETTER APOSTROPHE | ||
́ (\u0301) 1825 := COMBINING ACUTE ACCENT | ||
̂ (\u0302) 11 := COMBINING CIRCUMFLEX ACCENT | ||
̄ (\u0304) 13 := COMBINING MACRON | ||
̆ (\u0306) 67 := COMBINING BREVE | ||
̐ (\u0310) 1 := COMBINING CANDRABINDU | ||
̣ (\u0323) 1 := COMBINING DOT BELOW | ||
ά (\u03ac) 3 := GREEK SMALL LETTER ALPHA WITH TONOS | ||
ή (\u03ae) 3 := GREEK SMALL LETTER ETA WITH TONOS | ||
ί (\u03af) 1 := GREEK SMALL LETTER IOTA WITH TONOS | ||
α (\u03b1) 3 := GREEK SMALL LETTER ALPHA | ||
δ (\u03b4) 3 := GREEK SMALL LETTER DELTA | ||
ε (\u03b5) 5 := GREEK SMALL LETTER EPSILON | ||
η (\u03b7) 1 := GREEK SMALL LETTER ETA | ||
ι (\u03b9) 1 := GREEK SMALL LETTER IOTA | ||
μ (\u03bc) 3 := GREEK SMALL LETTER MU | ||
ν (\u03bd) 5 := GREEK SMALL LETTER NU | ||
ο (\u03bf) 1 := GREEK SMALL LETTER OMICRON | ||
π (\u03c0) 2 := GREEK SMALL LETTER PI | ||
ρ (\u03c1) 5 := GREEK SMALL LETTER RHO | ||
ς (\u03c2) 1 := GREEK SMALL LETTER FINAL SIGMA | ||
τ (\u03c4) 4 := GREEK SMALL LETTER TAU | ||
χ (\u03c7) 1 := GREEK SMALL LETTER CHI | ||
ό (\u03cc) 1 := GREEK SMALL LETTER OMICRON WITH TONOS | ||
ϛ (\u03db) 2 := GREEK SMALL LETTER STIGMA | ||
Ḍ (\u1e0c) 31 := LATIN CAPITAL LETTER D WITH DOT BELOW | ||
ḍ (\u1e0d) 1258 := LATIN SMALL LETTER D WITH DOT BELOW | ||
ḥ (\u1e25) 477 := LATIN SMALL LETTER H WITH DOT BELOW | ||
Ḷ (\u1e36) 2 := LATIN CAPITAL LETTER L WITH DOT BELOW | ||
ḷ (\u1e37) 34 := LATIN SMALL LETTER L WITH DOT BELOW | ||
Ṃ (\u1e42) 22 := LATIN CAPITAL LETTER M WITH DOT BELOW | ||
ṃ (\u1e43) 2363 := LATIN SMALL LETTER M WITH DOT BELOW | ||
Ṅ (\u1e44) 12 := LATIN CAPITAL LETTER N WITH DOT ABOVE | ||
ṅ (\u1e45) 1081 := LATIN SMALL LETTER N WITH DOT ABOVE | ||
Ṇ (\u1e46) 24 := LATIN CAPITAL LETTER N WITH DOT BELOW | ||
ṇ (\u1e47) 4869 := LATIN SMALL LETTER N WITH DOT BELOW | ||
Ṛ (\u1e5a) 220 := LATIN CAPITAL LETTER R WITH DOT BELOW | ||
ṛ (\u1e5b) 4760 := LATIN SMALL LETTER R WITH DOT BELOW | ||
Ṝ (\u1e5c) 11 := LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON | ||
ṝ (\u1e5d) 33 := LATIN SMALL LETTER R WITH DOT BELOW AND MACRON | ||
Ṣ (\u1e62) 123 := LATIN CAPITAL LETTER S WITH DOT BELOW | ||
ṣ (\u1e63) 6536 := LATIN SMALL LETTER S WITH DOT BELOW | ||
Ṭ (\u1e6c) 44 := LATIN CAPITAL LETTER T WITH DOT BELOW | ||
ṭ (\u1e6d) 2514 := LATIN SMALL LETTER T WITH DOT BELOW | ||
ἀ (\u1f00) 2 := GREEK SMALL LETTER ALPHA WITH PSILI | ||
ὖ (\u1f56) 1 := GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI | ||
ὥ (\u1f65) 1 := GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA | ||
‒ (\u2012) 10 := FIGURE DASH | ||
— (\u2014) 5872 := EM DASH | ||
‘ (\u2018) 112 := LEFT SINGLE QUOTATION MARK | ||
’ (\u2019) 112 := RIGHT SINGLE QUOTATION MARK | ||
‿ (\u203f) 2852 := UNDERTIE | ||
⁄ (\u2044) 12 := FRACTION SLASH | ||
₀ (\u2080) 1 := SUBSCRIPT ZERO | ||
₂ (\u2082) 7 := SUBSCRIPT TWO | ||
₄ (\u2084) 3 := SUBSCRIPT FOUR | ||
₆ (\u2086) 1 := SUBSCRIPT SIX | ||
₈ (\u2088) 2 := SUBSCRIPT EIGHT | ||
Ⅰ (\u2160) 469 := ROMAN NUMERAL ONE | ||
Ⅱ (\u2161) 98 := ROMAN NUMERAL TWO | ||
Ⅲ (\u2162) 38 := ROMAN NUMERAL THREE | ||
Ⅳ (\u2163) 134 := ROMAN NUMERAL FOUR | ||
Ⅴ (\u2164) 33 := ROMAN NUMERAL FIVE | ||
Ⅵ (\u2165) 110 := ROMAN NUMERAL SIX | ||
Ⅶ (\u2166) 22 := ROMAN NUMERAL SEVEN | ||
Ⅷ (\u2167) 7 := ROMAN NUMERAL EIGHT | ||
Ⅸ (\u2168) 52 := ROMAN NUMERAL NINE | ||
Ⅹ (\u2169) 30 := ROMAN NUMERAL TEN | ||
√ (\u221a) 1811 := SQUARE ROOT | ||
⏑ (\u23d1) 7 := METRICAL BREVE | ||
〈 (\u3008) 628 := LEFT ANGLE BRACKET | ||
〉 (\u3009) 628 := RIGHT ANGLE BRACKET | ||
🞄 (\u1f784) 60401 := BLACK SLIGHTLY SMALL CIRCLE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters