-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalize.py
19 lines (17 loc) · 1020 Bytes
/
normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import sys
import unicodedata
import re
# Source: https://github.com/nlpaueb/greek-bert#pre-process-text-deaccent---lower
def strip_accents_and_lowercase(s: str) -> str:
    """Return *s* lowercased with every nonspacing combining mark removed.

    The text is first decomposed with Unicode NFD so that precomposed
    accented characters are split into a base character followed by its
    combining marks; characters whose Unicode category is ``Mn``
    (Mark, nonspacing) are then dropped and the remainder is lowercased.

    Args:
        s: Text to normalize.

    Returns:
        The de-accented, lowercased text. The result is left in decomposed
        (NFD) form; it is not recomposed afterwards.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(
        c for c in decomposed if unicodedata.category(c) != 'Mn'
    ).lower()
# Patterns are compiled once at import time instead of being looked up again
# for every input line. Raw strings avoid invalid escape sequences such as
# "\#", which newer Python versions warn about.
_TR_HEADER = re.compile(r"#.*\n")                # header info in TR text
_BYZ_CHAPTER_VERSE = re.compile(r"[0-9]+,[0-9]+,")  # chapter/verse numbers in BYZ text
_BYZ_CSV_HEADER = re.compile("chapter,verse,text\n")  # CSV header row in BYZ text
_TR_NUMBER = re.compile(r"[0-9]+ ")              # numbers in TR text
_PUNCTUATION = re.compile("[.,'·:;ʼ!\"]")        # all punctuation to strip


def strip_markup(text):
    """Remove headers, verse numbering and punctuation from one line of text.

    The substitutions run in a fixed order because later patterns rely on
    earlier ones: the ``chapter,verse,`` prefix must be removed before the
    punctuation pass deletes the commas it is matched by.

    Args:
        text: A single line, including its trailing newline if it has one.
            The CSV header pattern is lowercase, so the line is expected to
            have been lowercased already.

    Returns:
        The cleaned line. It is the empty string when the whole line was a
        ``#`` header or the CSV header row.
    """
    text = _TR_HEADER.sub('', text)
    text = _BYZ_CHAPTER_VERSE.sub('', text)
    text = _BYZ_CSV_HEADER.sub('', text)
    text = _TR_NUMBER.sub('', text)
    return _PUNCTUATION.sub('', text)


def main():
    """Normalize the file named by argv[1] and write the result to argv[2].

    Each input line is de-accented and lowercased, then cleaned with
    ``strip_markup``. The output file is opened in exclusive-creation mode
    ("x"), so an existing file is never overwritten; ``FileExistsError`` is
    raised instead.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message rather than a bare IndexError.
        sys.exit("usage: normalize.py INPUT_FILE OUTPUT_FILE")

    in_path, out_path = sys.argv[1], sys.argv[2]
    # The input is opened first, so a missing input file does not leave an
    # empty output file behind.
    with open(in_path, "r", encoding="utf-8") as src, \
            open(out_path, "x", encoding="utf-8") as dst:
        for line in src:
            dst.write(strip_markup(strip_accents_and_lowercase(line)))


if __name__ == "__main__":
    main()