-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_source_vocab.py
61 lines (44 loc) · 1.84 KB
/
extract_source_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
import os
from bs4 import BeautifulSoup
# ==================================================================================================
# Directory containing this script. The trailing "/" is kept so the paths
# below can be built by plain string concatenation.
filepath = "".join([os.path.dirname(os.path.realpath(__file__)), "/"])
# Template for the per-level input files; filled with (level, language).
datafile = "".join([filepath, "sourcedata/hsk_{}_{}.txt"])
# Levels to process, in order.
levels = list(range(1, 7))
# Language code used in both the input and the output file names.
language = "ger"
# Path of the JSON file that receives the collected vocabulary.
outfile = "".join([filepath, "sourcedata/vocab_{}.json".format(language)])
# ==================================================================================================
def _parse_translations(cell):
    """Collect the translations found in a table cell, keyed by their label.

    Some words have several meanings depending on their type, so the cell's
    direct children are walked in order: a ``<span>`` sets the current label
    (its stripped text with ":" removed), ``<br>`` tags are skipped, and the
    stripped text of any other child is stored under the current label.

    Text that appears before the first ``<span>`` is stored under the
    empty-string label. Whitespace-only children are ignored so that they
    cannot overwrite a translation that was already recorded.

    Args:
        cell: The parsed ``<td>`` element holding the translations.

    Returns:
        A dict mapping each label to its translation text.
    """
    translations = {}
    # Start fresh for every cell so a label never leaks in from a previous row.
    label = ""
    for child in cell.children:
        if child.name == "span":
            label = child.text.strip().replace(":", "")
        elif child.name == "br":
            continue
        else:
            text = child.text.strip()
            if text:
                translations[label] = text
    return translations


def main():
    """Extract the vocabulary from the per-level source files into one JSON file.

    For every level in ``levels`` the file ``datafile.format(level, language)``
    is read and parsed as HTML. Each ``<tr>`` inside the ``content_txt`` div
    that has at least three ``<td>`` cells becomes one entry with the keys
    "level", "hanzi" (first cell), "pinyin" (second cell) and "text" (the
    labelled translations of the third cell). Rows with fewer cells are
    skipped. All entries are written to ``outfile`` as JSON.

    Raises:
        ValueError: If a source file contains no ``content_txt`` div.
    """
    vocabulary = []

    for level in levels:
        source_path = datafile.format(level, language)
        # "utf-8-sig" drops a leading BOM if present; undecodable bytes are
        # ignored rather than aborting the run.
        with open(source_path, "r", encoding="utf-8-sig", errors="ignore") as file:
            content = file.read()

        soup = BeautifulSoup(content, "lxml")
        content_box = soup.find("div", attrs={"class": "content_txt"})
        if content_box is None:
            raise ValueError(
                "No div with class 'content_txt' found in {}".format(source_path)
            )

        for row in content_box.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 3:
                # Not a vocabulary row (e.g. a header row without <td> cells).
                continue

            vocabulary.append(
                {
                    "level": level,
                    "hanzi": cells[0].text.strip(),
                    "pinyin": cells[1].text.strip(),
                    "text": _parse_translations(cells[2]),
                }
            )

    with open(outfile, "w", encoding="utf-8") as file:
        json.dump(vocabulary, file, ensure_ascii=False, indent=2, sort_keys=True)
# ==================================================================================================
if __name__ == "__main__":
    # Run the extraction only when this file is executed as a script, then
    # report completion on stdout.
    main()
    print("FINISHED")