utils.py
import os
import pickle
import sys

import pandas as pd
from torchtext.data import get_tokenizer

from vocab import Vocabulary

# Keep the pickled vocabulary at a single path next to this script so that
# save_vocab and load_vocab always agree on the location.
VOCAB_PATH = os.path.join(sys.path[0], 'savedVocab')
def clean_text(text):
    """Pad punctuation and currency symbols with spaces so the tokenizer
    emits them as separate tokens, increasing GloVe embedding coverage."""
    dic = {
        '$': ' $ ',
        '-': ' - ',
        '£': ' £ ',
        '₹': ' ₹ ',
        '“': ' “ ',
        '\'': ' \' ',
        '/': ' / ',
        '[': ' [ ',
        ']': ' ] ',
        '—': ' - ',
        '–': ' - ',
        '¢': ' ¢ ',
        '‘‘': ' ‘‘ ',
        '€': ' € ',
        '<': ' < ',
        '”': ' ” ',
        '`': ' ` ',
        '+': ' + ',
        '’': ' ’ ',
        '°': ' ° ',
        '″': ' ″ ',
        '−': ' − ',
        '×': ' × ',
    }
    for char, padded in dic.items():
        text = text.replace(char, padded)
    return text
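

# Example (illustrative sketch): padding the listed symbols lets the
# "basic_english" tokenizer split them into standalone tokens instead of
# leaving them glued to neighbouring words, e.g.:
#
#   clean_text("£5-7")                                  # ' £ 5 - 7'
#   get_tokenizer("basic_english")(clean_text("£5-7"))  # ['£', '5', '-', '7']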
def save_vocab(vocab):
    """Pickle the vocabulary to VOCAB_PATH."""
    with open(VOCAB_PATH, 'wb') as saved_vocab:
        pickle.dump(vocab, saved_vocab)
    print("Saved the vocab.")


def load_vocab():
    """Load the pickled vocabulary, building it first if none has been saved."""
    if not os.path.exists(VOCAB_PATH):
        build_vocab()
    with open(VOCAB_PATH, 'rb') as saved_vocab:
        vocab = pickle.load(saved_vocab)
    print("Loaded the vocab.")
    return vocab
def build_vocab():
    """Build a Vocabulary from every token appearing at least twice across the
    passages, questions, and answers of the train and validation sets."""
    train = pd.read_csv(os.path.join(sys.path[0], 'data/train.csv'))
    val = pd.read_csv(os.path.join(sys.path[0], 'data/val.csv'))
    vocab = Vocabulary()
    tokenizer = get_tokenizer("basic_english")
    words = []
    print("Building Vocabulary")
    for df in (train, val):
        for _, row in df.iterrows():
            for field in ('passage', 'question', 'answer'):
                words += tokenizer(str(row[field]))
    ser = pd.Series(words)
    counts = ser.value_counts()
    # Keep tokens that occur at least twice, in order of first appearance;
    # rarer tokens are left out of the vocabulary.
    frequent = ser[ser.isin(counts[counts >= 2].index)].unique()
    for word in frequent:
        vocab.add_word(word)
    save_vocab(vocab)
    return vocab
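

# Example usage (a sketch; the lookup side of Vocabulary is an assumption,
# since this file only shows add_word; check vocab.py for the actual
# word-to-index interface):
#
#   vocab = load_vocab()  # builds from data/train.csv and data/val.csv on first run
#   tokens = get_tokenizer("basic_english")(clean_text("Who paid the $5 fee?"))
#   indices = [vocab(t) for t in tokens]  # hypothetical lookup call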