#! python3
# -*- coding: utf-8 -*-
"""
Goal: cluster a small collection of short texts.
1. Remove useless words
2. Segment the text into words
3. TF-IDF
4. Clustering
"""
import json
from random import shuffle

import jieba
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# scikit-learn's TF-IDF support is used here through two classes: CountVectorizer and TfidfTransformer.
# CountVectorizer.fit_transform turns the texts into a term-count matrix whose element
# weight[i][j] is the frequency of word j in text i; get_feature_names() lists the vocabulary
# and toarray() materialises the matrix.
# TfidfTransformer also has a fit_transform method; it converts those counts into TF-IDF values.


def extract_native_comment(filename):
    """
    Extract the raw comment texts from the database export.
    :param filename: path to a JSON export whose top-level 'RECORDS' list holds the rows
    :return: list of raw comment strings
    """
with open(filename, 'rb') as f:
data = json.loads(f.read())
record = data['RECORDS']
comment = []
for item in record:
# logger.info(item['id'])
single_comment = item['comment_content']
# print(single_comment)
comment.append(single_comment)
# break
# print(comment)
return comment


def shuffle_comment(comment):
    """
    1. Shuffle the comments.
    2. Split each short text on the Chinese colon and keep only the actual reply content.
    :param comment: list of raw comment strings
    :return: list of cleaned comment strings
    """
shuffle(comment)
real_comment = []
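    # The raw comments are assumed to look roughly like "某用户：回复@另一用户：真正的评论内容";
    # the loop below keeps the text after the last colon of the "回复" (reply) segment,
    # and keeps comments that contain no colon at all unchanged.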
for item in comment:
print(item)
split_list = item.split(u':')
print(split_list)
length = len(split_list)
if length == 1:
real_comment.append(item)
elif length > 1:
flag = False
for idx in range(1, length):
if '回复' in split_list[idx]:
inner_split_list = split_list[idx].split(':')
print(inner_split_list[-1])
print('------------------')
real_comment.append(inner_split_list[-1])
flag = True
            if not flag:
                # No '回复' (reply) segment was found after scanning the whole list,
                # so fall back to the text after the last colon.
                real_comment.append(split_list[-1])
print(real_comment)
return real_comment


def cut_word(real_comment):
    """
    Segment every cleaned comment with jieba and collect the tokens into one flat list.
    """
    after_cut_word = []
for item in real_comment:
print('----------------->')
print(item)
        seg_list = list(jieba.cut(item, cut_all=False))  # accurate mode, suitable for text analysis
print(seg_list)
print('----------------->')
after_cut_word.extend(seg_list)
# break
# print(after_cut_word)
return after_cut_word


def remove_stop_word(after_cut_word):
    """
    Remove stop words. For now this uses general stop-word lists collected from the web
    (e.g. the Harbin Institute of Technology list); a Weibo-specific list may be built
    later depending on how the results look.
    :param after_cut_word: flat list of segmented tokens
    :return: tokens with stop words removed
    """
    non_empty = [i for i in after_cut_word if i.strip()]  # drop whitespace-only tokens
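    # The stop-word file is assumed to hold one stop word per line, UTF-8 encoded, e.g. 的 / 了 / 是.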
    stop_word_collection = set()
    with open('stop_word/所有停用词.txt', 'r', encoding='utf8') as f:
        for line in f:
            word = line.strip()  # strip the trailing newline so membership tests actually match
            if word:
                stop_word_collection.add(word)
    after_remove = [word for word in non_empty if word not in stop_word_collection]
    return after_remove


def continue_remove_useless_word(after_remove):
    """
    Manual inspection still shows some useless tokens, so additionally keep only tokens
    made up entirely of Chinese characters.
    :param after_remove: tokens with stop words removed
    :return: tokens consisting only of Chinese characters
    """
    # \u4e00-\u9fff is the CJK Unified Ideographs block, so digits, Latin letters,
    # punctuation and emoji are all dropped here.
    final_result = [word for word in after_remove
                    if word and all('\u4e00' <= ch <= '\u9fff' for ch in word)]
    return final_result


def feature(corpus):
    """
    Build the TF-IDF matrix for the corpus with CountVectorizer and TfidfTransformer
    (see the note near the imports for how the two classes fit together), then cluster
    the documents with KMeans.
    """
    vectorizer = CountVectorizer()    # turns the texts into a term-count matrix: a[i][j] is the count of word j in text i
    transformer = TfidfTransformer()  # converts the counts into TF-IDF weights
tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
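    # For intuition, the same two steps on a hypothetical toy corpus (the demo_* names are
    # illustrative only and not used anywhere in this script):
    #     demo_counts = CountVectorizer().fit_transform(['first tiny document', 'second tiny document text'])
    #     demo_tfidf = TfidfTransformer().fit_transform(demo_counts)
    #     demo_tfidf.toarray()  # dense matrix of TF-IDF weights, one row per document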
print('%%%%%%%%%')
print(tf_idf)
print('%%%%%%%%%')
    word = vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary
print('&&&&&&&&&')
print(word)
print('&&&&&&&&&')
print('$$$$$$$$$')
    weight = tf_idf.toarray()  # dense TF-IDF matrix: w[i][j] is the TF-IDF weight of word j in text i
print(weight)
print('$$$$$$$$$')
train_x, test_x = train_test_split(tf_idf, test_size=0.2)
    # scores = []
    # for i in range(2, 21):
    #     km = KMeans(n_clusters=i)
    #     km.fit(train_x)
    #     label = km.labels_
    #     print(label)
    #     print(km.inertia_)  # inertia helps judge whether the cluster count is suitable:
    #                         # smaller within-cluster distance means a better split, so pick
    #                         # the count at the elbow point
    #     scores.append({-km.score(test_x): i})
    # Determine the number of clusters
    # return 19
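    # Assumption: the value 19 used below came out of that sweep; rerun it if the corpus changes.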
km = KMeans(n_clusters=19)
km.fit(train_x)
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
print(vectorizer.get_stop_words())
for i in range(19):
print("Cluster %d:" % i, end='')
for ind in order_centroids[i, :10]:
print(' %s' % terms[ind], end='')
print()
# sort_score = sorted(scores, key=lambda k: k[0], reverse=True)
# print(sort_score)


def main():
comment = extract_native_comment('comment.json')
real_comment = shuffle_comment(comment)
after_cut_word = cut_word(real_comment)
after_remove = remove_stop_word(after_cut_word)
final_result = continue_remove_useless_word(after_remove)
feature(final_result)
# print(final_result)
# with open('final_result.txt', 'w') as f:
# f.write(json.dumps(final_result, sort_keys=True, indent=4, ensure_ascii=False))


if __name__ == '__main__':
main()