-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtfidf.py
49 lines (41 loc) · 1.38 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
from sklearn.preprocessing import normalize
def count_freq(sentences):
"""Calculate word freq and word count for word in these sentences
Args:
sentences (list(str)):
"""
wordfreq = dict() # word->freq
wordcount = dict() # word->count
for s in sentences:
tmp = s.split(' ')
word_in_sentence = set(tmp)
for w in word_in_sentence:
if wordcount.get(w) is None:
wordcount[w] = 1
else:
wordcount[w] += 1
for w in tmp:
if wordfreq.get(w) is None:
wordfreq[w] = 1
else:
wordfreq[w] += 1
return wordfreq, wordcount
def foo(categories):
wordfreq, wordcount = [], []
for category in categories:
t1, t2 = count_freq(category)
wordfreq.append(t1)
wordcount.append(t2)
return wordfreq, wordcount
def tfidf(sentence, label_num, cat_len, wordfreq, wordcount):
score = np.zeros(label_num)
for i, (wf, wc) in enumerate(zip(wordfreq, wordcount)):
tmp = []
for w in sentence.split(' '):
# TODO: if w not in wf (not in this category)?
tf = wf[w] / len(wf)
idf = np.log10(len(cat_len[i]) / (1 + wc[w]))
tmp.append(tf * idf)
score[i] = np.average(normalize(tmp, axis=-1)) # sum?
return score