-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbim_bayes.py
157 lines (128 loc) · 5.1 KB
/
bim_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import pandas
import cPickle as pickle
from chisquare_test import count_cate_page
from math import log
# Root directory of the corpus; the relative 'train/...' and 'test/...' paths
# used by the script are resolved against it.
# NOTE(review): this runs at import time and the path is machine-specific.
CORPUS_DIR = "/Users/pengtuo/Downloads/corpus/"
os.chdir(CORPUS_DIR)
def readbunchobj(path):
    """Load and return the pickled object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    files that come from a trusted source.
    """
    with open(path, "rb") as stream:
        return pickle.load(stream)
def readfile(path):
    """Return the full contents of the file at ``path`` decoded as UTF-8."""
    with open(path, 'rb') as stream:
        raw = stream.read()
    return raw.decode('utf-8')
# Build the feature-word vector.
def contrust_vect(words_path, top_n=1000):
    """Collect the leading feature words of every ``*_select.txt`` file.

    Args:
        words_path: Directory containing the ``*_select.txt`` files; each
            line of such a file is treated as one word.
        top_n: Number of leading lines taken from each file (default 1000).

    Returns:
        A list of words, grouped by file in sorted file-name order. A word
        that occurs in several files appears once per file. Blank lines are
        skipped.
    """
    words_vect = []
    # Sort so the result does not depend on whatever order os.listdir returns.
    for file_name in sorted(os.listdir(words_path)):
        # The suffix test also rejects '.DS_Store' and any other stray file.
        if not file_name.endswith('_select.txt'):
            continue
        lines = readfile(os.path.join(words_path, file_name)).split('\n')
        # A trailing newline makes split() yield '' as its last item; keeping
        # it would register the empty string as a feature word.
        words_vect.extend(word for word in lines[:top_n] if word)
    return words_vect
# Matrix of the log document frequency p(x_i|y) of every feature word in
# every class.
def get_class_wordtimes(category, words_vect, word_times_path, cate_page_num,
                        default_log_prob=-10):
    """Build the per-category log-probability table of the feature words.

    Every file in ``word_times_path`` is read as ``word:count`` lines, and the
    text before the first '_' of the file name is used as its category. For
    each feature word found in the file the stored value is
    ``log(count + 1) - log(cate_page_num[category] + 2)``.

    Args:
        category: Row labels of the resulting table.
        words_vect: Iterable of feature words (list or set); duplicates are
            ignored.
        word_times_path: Directory holding one ``word:count`` file per
            category.
        cate_page_num: Mapping from category name to the number that is
            smoothed into the denominator.
        default_log_prob: Value written into every cell for which no count
            was found (default -10).

    Returns:
        A float ``pandas.DataFrame`` with one row per category and one column
        per distinct feature word.

    Raises:
        KeyError: If a file contains a feature word but its category is
            missing from ``cate_page_num``.
        ValueError: If the count of a feature word is not an integer.
    """
    # De-duplicate while keeping iteration order: pandas needs an ordered,
    # unique column list, and the set gives O(1) membership tests below.
    columns = []
    vocabulary = set()
    for feature in words_vect:
        if feature not in vocabulary:
            vocabulary.add(feature)
            columns.append(feature)

    df = pandas.DataFrame(index=category, columns=columns, dtype=float)
    for cate_file in os.listdir(word_times_path):
        if cate_file == '.DS_Store':
            continue
        cate = cate_file.split('_')[0]
        word_fequents_dict = {}
        lines = readfile(os.path.join(word_times_path, cate_file)).split('\n')
        for line in lines:
            parts = line.split(':')
            # Blank lines (e.g. after a trailing newline) and lines without a
            # ':' carry no count; indexing parts[1] on them would raise.
            if len(parts) < 2 or parts[0] not in vocabulary:
                continue
            word_fequents_dict[parts[0]] = (
                log(int(parts[1]) + 1) - log(cate_page_num[cate] + 2)
            )
        # An explicit Series aligned to the columns leaves NaN in every cell
        # that has no count, regardless of how pandas treats raw dicts.
        df.loc[cate] = pandas.Series(word_fequents_dict, index=columns, dtype=float)
    return df.fillna(default_log_prob)
# Core of the classifier: score one document against every category.
def classify(content=None, wordtimes_ndarray=None):
    """Return the category whose feature-word scores sum highest for a document.

    Args:
        content: Whitespace-separated document, either UTF-8 encoded bytes or
            already decoded text.
        wordtimes_ndarray: ``pandas.DataFrame`` with one row per category and
            one column per feature word.

    Returns:
        The index label of the row with the largest sum over the columns
        whose word occurs in ``content``. Ties, including the case where no
        feature word occurs at all, go to the first row.
    """
    # Only decode real byte strings; decoded text is used as-is.
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    per_page_words = set(content.split())
    # Intersect the document's words with the feature-word columns.
    matched = list(per_page_words.intersection(wordtimes_ndarray.columns))
    # .values replaces the removed DataFrame.as_matrix(); astype(float) also
    # copes with an object-dtype table.
    scores = wordtimes_ndarray[matched].values.astype(float).sum(axis=1)
    # argmax returns the first maximum, matching list.index(max(...)).
    return wordtimes_ndarray.index[scores.argmax()]
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if the denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


# Naive Bayes classification of the whole test set, with evaluation.
def predict(test_set, wordtimes_ndarray):
    """Classify every test document and compute per-category and mean metrics.

    Args:
        test_set: Object exposing ``target_name`` (category names),
            ``label`` (true category per document) and ``contents``
            (document texts), with ``label[i]`` belonging to ``contents[i]``.
        wordtimes_ndarray: Score table passed through to ``classify``.

    Returns:
        A 6-tuple ``(average_accuracy, average_recall, average_f,
        cate_accuracy, cate_recall, cate_F_measure)``. The three dicts map
        each category to TP / (TP + FP), TP / (TP + FN) and
        2TP / (2TP + FN + FP) respectively, so the value named "accuracy" is
        the per-category precision. The three averages are the unweighted
        means over ``test_set.target_name``. A ratio with a zero denominator
        is reported as 0.0.
    """
    category = test_set.target_name
    labels = test_set.label

    cate_TP = dict.fromkeys(category, 0)
    cate_FP = dict.fromkeys(category, 0)
    cate_FN = dict.fromkeys(category, 0)

    for i, content in enumerate(test_set.contents):
        label = labels[i]
        prediction = classify(content=content, wordtimes_ndarray=wordtimes_ndarray)
        # .get() keeps counting even if a label or a prediction is not one of
        # the declared categories, instead of raising KeyError.
        if prediction == label:
            cate_TP[prediction] = cate_TP.get(prediction, 0) + 1
        else:
            cate_FP[prediction] = cate_FP.get(prediction, 0) + 1
            cate_FN[label] = cate_FN.get(label, 0) + 1
        # Progress output: number of documents processed so far.
        print(i + 1)

    cate_accuracy = {}
    cate_recall = {}
    cate_F_measure = {}
    for cate in category:
        tp, fp, fn = cate_TP[cate], cate_FP[cate], cate_FN[cate]
        cate_accuracy[cate] = _safe_ratio(tp, tp + fp)
        cate_recall[cate] = _safe_ratio(tp, tp + fn)
        cate_F_measure[cate] = _safe_ratio(2 * tp, 2 * tp + fn + fp)

    # Average over the actual number of categories rather than a fixed 10.
    n_categories = len(category)
    average_accuracy = _safe_ratio(sum(cate_accuracy[c] for c in category), n_categories)
    average_recall = _safe_ratio(sum(cate_recall[c] for c in category), n_categories)
    average_f = _safe_ratio(sum(cate_F_measure[c] for c in category), n_categories)
    return average_accuracy, average_recall, average_f, cate_accuracy, cate_recall, cate_F_measure
if __name__ == '__main__':
    # Every entry of the segmented test-data directory is a category name,
    # except the macOS '.DS_Store' file. Filtering (instead of list.remove)
    # also works when that file does not exist.
    category = [name for name in os.listdir('test/test_data_seg')
                if name != '.DS_Store']

    # Build the set of feature words.
    words_path = 'train/train_chi_order/'
    words_vect = set(contrust_vect(words_path))

    # Build the matrix of the log document frequency p(x_i|y) of every
    # feature word in every class.
    word_times_path = 'train/wordtimes/'
    cate_page_num = count_cate_page(category)
    word_fequents_ndarray = get_class_wordtimes(
        category, words_vect, word_times_path, cate_page_num)

    # Predict and evaluate.
    test_set = readbunchobj('test/test_word_bag/test_set.dat')
    (accuracy_rate, recall_rate, f_score,
     cate_accuracy, cate_recall, cate_F_measure) = predict(test_set, word_fequents_ndarray)

    separator = '--' * 40
    print(separator)
    print('features_number = ' + repr(len(words_vect)))
    print(separator)
    for cate in category:
        print(cate + '_accuracy_rate = ' + repr(cate_accuracy[cate]))
        print(cate + '_recall_rate = ' + repr(cate_recall[cate]))
        print(cate + '_f_score = ' + repr(cate_F_measure[cate]))
        print(separator)
    # Each summary line prints its own metric; previously all three printed
    # accuracy_rate.
    print('accuracy_rate = ' + repr(accuracy_rate))
    print('recall_rate = ' + repr(recall_rate))
    print('f_score = ' + repr(f_score))
    print(separator)