extract word.py
import os.path
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from openpyxl import Workbook
class WRITE_TO_EXCEL():
    """Append a DataFrame to 'Sheet1' of an .xlsx file, creating the file on first use."""
    def __init__(self, path):
        self.path = path

    def write_to_file(self, value):
        df = value
        if not os.path.exists(self.path):
            # First call: create the workbook with a single sheet named 'Sheet1'.
            workbook = Workbook()
            workbook.active.title = 'Sheet1'
            workbook.save(self.path)
        # Count the rows already on the sheet so the new rows land below them.
        df_rows = pd.read_excel(self.path, sheet_name='Sheet1').shape[0]
        # Append in place; if_sheet_exists='overlay' requires pandas >= 1.4.
        with pd.ExcelWriter(self.path, engine='openpyxl', mode='a',
                            if_sheet_exists='overlay') as writer:
            df.to_excel(writer, sheet_name='Sheet1', startrow=df_rows + 1,
                        index=False, header=False)
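# Usage sketch (illustrative; `df` stands for any one-row DataFrame):
#   writer = WRITE_TO_EXCEL('BERT_vec.xlsx')
#   writer.write_to_file(df)   # appends df below whatever Sheet1 already holds
# Note: re-reading the sheet on every call keeps the class simple, but repeated
# appends grow quadratically in cost; acceptable for a one-off vocabulary dump.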
class read_vocab:
    """Collect the alphabetic tokens (plus BERT's special tokens) from the vocab file."""
    def __init__(self):
        self.filename = r"Google/uncased_L-12_H-768_A-12/vocab.txt"  # vocab file, one token per line

    def read(self):
        with open(self.filename, 'rb') as f:
            lines = f.read().splitlines()  # one raw token per line
        vocab_dict = {}                # line index (= BERT word id) -> raw token bytes
        list_all_english_word = []     # the kept word ids, in order
        special_tokens = ('[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]')
        for i, line in enumerate(lines):
            # Keep tokens made only of ASCII letters, plus the special tokens;
            # this skips punctuation, numbers, and '##'-prefixed word pieces.
            if line.isalpha() or line.decode() in special_tokens:
                vocab_dict[i] = line
                list_all_english_word.append(i)
        return vocab_dict, list_all_english_word
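# Illustrative return values (exact ids depend on the vocab file):
#   vocab_dict            -> {0: b'[PAD]', 100: b'[UNK]', ..., 2000: b'word', ...}
#   list_all_english_word -> [0, 100, ..., 2000, ...]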
output_file = 'BERT_vec.xlsx'
# Each appended row holds: word_id, word, vector (no header row is written).
writer = WRITE_TO_EXCEL(output_file)
read_BERT = read_vocab()
# Load the tokenizer once, outside the loop below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
# Put the model in "evaluation" mode: deterministic feed-forward only (dropout off).
model.eval()
BERT_dict, BERT_dict_key = read_BERT.read()
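# With output_hidden_states=True the forward pass exposes a tuple of 13 hidden
# states: the embedding output plus one per encoder layer; index 12 is the last layer.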
for word_id in BERT_dict_key:
    text = BERT_dict[word_id]
    # Wrap the single word in BERT's sentence markers before tokenizing.
    marked_text = "[CLS] " + text.decode() + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    index_of_word = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(tokenized_text)
    segment_ids = [1] * len(tokenized_text)  # single-segment input
    input_tensor = torch.tensor([index_of_word])
    segment_tensor = torch.tensor([segment_ids])
    with torch.no_grad():
        output = model(input_tensor, token_type_ids=segment_tensor)
    # output[2] is the hidden-states tuple: [layer 12 = last][batch 0][token 1 = the word after [CLS]]
    vec = output[2][12][0][1]
    data = {'word_id': word_id, 'word': tokenized_text[1], 'vector': [vec.numpy()]}
    df = pd.DataFrame(data)
    writer.write_to_file(df)
print('Done')
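# Quick sanity check of the output afterwards (illustrative):
#   pd.read_excel('BERT_vec.xlsx', sheet_name='Sheet1', header=None).head()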