# tfidf.py: TF-IDF document retrieval with MRR evaluation.
import os
import logging

import pandas as pd
from tqdm import tqdm
from gensim import corpora, models, similarities

from w2v_wmd.dataProcess import seg_sentence, load_stopwords
from utils import read_pkl_file, save_pkl_file, eval_MPP

logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

data_dir = "../data/"
subwayqq_path = data_dir + "documents.csv"
tfidf_file_dir = data_dir + "tfidf/"
stopwords_file = data_dir + "stopwords.txt"


class TfIdf_Model:
    def __init__(self, docs, tfidf_file_dir, stop_words_file=None):
        logger.info('TfIdf_Model is initializing...')
        self.docs = docs
        self.tfidf_file_dir = tfidf_file_dir
        self.stop_words_file = stop_words_file
        if self.stop_words_file:
            self.stopwords = load_stopwords(stop_words_file)
        # makedirs also creates missing parent directories (mkdir would fail on them).
        os.makedirs(tfidf_file_dir, exist_ok=True)

    # Compute TF-IDF over all documents and cache the model, dictionary,
    # and transformed corpus to disk.
    def compute_tfidf(self, tfidf_file, tfidf_dictionary_file, docs_tfidf_file):
        # Segment every document into a token list.
        texts = [self.preprocess_data(question) for question in self.docs]
        # Map tokens to integer ids, then represent each document as a bag of words.
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # Fit the TF-IDF model and transform the whole corpus.
        tfidf = models.TfidfModel(corpus)
        docs_tfidf = tfidf[corpus]
        save_pkl_file(tfidf, tfidf_file)
        save_pkl_file(dictionary, tfidf_dictionary_file)
        save_pkl_file(docs_tfidf, docs_tfidf_file)
        return tfidf, docs_tfidf, dictionary
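
    # A toy illustration (hypothetical example data, not part of the pipeline)
    # of what the gensim calls above produce:
    #   texts = [["subway", "line", "map"], ["subway", "ticket"]]
    #   dictionary = corpora.Dictionary(texts)           # token -> integer id
    #   corpus = [dictionary.doc2bow(t) for t in texts]  # [(token_id, count), ...]
    #   tfidf = models.TfidfModel(corpus)
    #   tfidf[corpus[0]]                                 # [(token_id, weight), ...]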

    # Load the cached TF-IDF artifacts if present, otherwise build them,
    # then construct the cosine-similarity index over the corpus.
    def init_search_engine(self):
        tfidf_file = os.path.join(self.tfidf_file_dir, "tfidf_model.pkl")
        tfidf_dictionary_file = os.path.join(self.tfidf_file_dir, "tfidf_dictionary.pkl")
        docs_tfidf_file = os.path.join(self.tfidf_file_dir, "docs_tfidf.pkl")
        if os.path.exists(tfidf_file):
            self.tfidf_model = read_pkl_file(tfidf_file)
            self.dictionary = read_pkl_file(tfidf_dictionary_file)
            self.docs_tfidf = read_pkl_file(docs_tfidf_file)
        else:
            self.tfidf_model, self.docs_tfidf, self.dictionary = self.compute_tfidf(
                tfidf_file, tfidf_dictionary_file, docs_tfidf_file)
        self.cosine_similar = similarities.MatrixSimilarity(
            self.docs_tfidf, num_features=len(self.dictionary.token2id))
        # SparseMatrixSimilarity is more memory-friendly for large sparse corpora:
        # self.cosine_similar = similarities.SparseMatrixSimilarity(
        #     self.docs_tfidf, num_features=len(self.dictionary.token2id))

    # Segment a sentence into tokens, dropping stopwords when a stopword file was given.
    def preprocess_data(self, sentence):
        if self.stop_words_file:
            sentence_words = seg_sentence(sentence, self.stopwords)
        else:
            sentence_words = seg_sentence(sentence)
        return sentence_words

    # Search for the top-k related documents. Here we sort the full similarity
    # list; a min-heap would avoid the full sort (see the sketch after this class).
    def search_related_files(self, query, top_k):
        query_words = self.preprocess_data(query)
        # Project the query into the same TF-IDF vector space as the corpus.
        query_bow = self.dictionary.doc2bow(query_words)
        query_tfidf = self.tfidf_model[query_bow]
        # Cosine similarity between the query and every document in the index.
        sims = self.cosine_similar[query_tfidf]
        similars = sorted(enumerate(sims.tolist()), key=lambda x: x[1], reverse=True)
        top_docs = []
        for index, sim in similars[:top_k]:
            top_docs.append((index, self.docs[index], sim))
        return top_docs
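

# A minimal sketch of the min-heap alternative mentioned above (hypothetical
# helper, not part of the original code): heapq.nlargest keeps only the k best
# candidates, O(n log k), instead of sorting all n similarities, O(n log n).
import heapq

def top_k_by_heap(sims, docs, top_k):
    # `sims` is the similarity array, `docs` the parallel list of documents.
    best = heapq.nlargest(top_k, enumerate(sims), key=lambda x: x[1])
    return [(index, docs[index], sim) for index, sim in best]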


# Evaluate retrieval over all (query, gold id) pairs and report MRR;
# named `evaluate` so the Python builtin `eval` is not shadowed.
def evaluate(queries, tfIdf_model, topk):
    labels = []
    pred = []
    with tqdm(total=len(queries)) as pbar:
        for query, qid in queries:
            labels.append(qid)
            top_docs = tfIdf_model.search_related_files(query, topk)
            pred.append([index for index, question, sim in top_docs])
            pbar.update(1)
    MRR = eval_MPP(labels, pred)
    logger.info("MRR: {}".format(MRR))
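

# eval_MPP is imported from utils and not shown here; a minimal sketch of the
# MRR it presumably reports (hypothetical helper, an assumption based on how
# eval_MPP is called above):
def mrr_sketch(labels, pred):
    # labels[i] is the gold doc id for query i; pred[i] is its ranked id list.
    total = 0.0
    for gold, ranked in zip(labels, pred):
        if gold in ranked:
            total += 1.0 / (ranked.index(gold) + 1)  # reciprocal rank, 1-based
    return total / len(labels)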


if __name__ == '__main__':
    all_data = pd.read_csv(subwayqq_path)
    # Deduplicate the document column to build the retrieval corpus.
    docs = all_data["docs"].drop_duplicates().reset_index(drop=True).values.tolist()
    # Pair every similar query with the corpus index of its standard question.
    queries = []
    for index, row in all_data.iterrows():
        query = row["similar_query"]
        question = row["standard_question"]
        q_id = docs.index(question)
        queries.append((query, q_id))
    # Initialize the document manager and the search engine.
    tfIdf_model = TfIdf_Model(docs, tfidf_file_dir, stopwords_file)
    tfIdf_model.init_search_engine()
    evaluate(queries, tfIdf_model, topk=10)
    # Single-query example:
    # query, q_id = queries[0]
    # top_docs = tfIdf_model.search_related_files(query, 10)
    # print('query is: ', query)
    # print('result is: ')
    # print(top_docs)