-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathclass_word_frequents.py
66 lines (53 loc) · 1.89 KB
/
class_word_frequents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so sys is
# reloaded to get it back, and UTF-8 is then made the interpreter-wide default
# codec for implicit str <-> unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
# Hard-coded working directory: the relative 'train/...' paths used in the
# __main__ section at the bottom of the file are resolved against it.
os.chdir('/Users/pengtuo/Downloads/corpus/')
# Read a file.
def _readfile(path):
    """Return the whole content of the file at ``path`` as decoded text.

    The file is opened in binary mode and its bytes are decoded as UTF-8.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded text of the entire file.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(path, 'rb') as handle:
        raw_bytes = handle.read()
    # The bytes are fully in memory, so decoding can happen after the close.
    return raw_bytes.decode('utf-8')
# Save to a file.
def _savefile(savepath, content_dict):
    """Write a mapping to ``savepath`` as UTF-8 encoded ``key:value`` lines.

    Every entry of ``content_dict`` produces one line of the form
    ``key:value`` followed by a newline. Lines are written in the mapping's
    own iteration order. An existing file at ``savepath`` is overwritten.

    The file is opened in binary mode, so each line is encoded to UTF-8
    explicitly instead of relying on the process-wide default encoding.

    Args:
        savepath: Path of the file to create or overwrite.
        content_dict: Mapping of text keys to values; ``str(value)`` is
            what gets written after the colon.
    """
    with open(savepath, 'wb') as fp:
        for key, value in content_dict.items():
            # Keys that are already byte strings are written unchanged;
            # text keys are encoded here rather than implicitly by write().
            if isinstance(key, bytes):
                key_bytes = key
            else:
                key_bytes = key.encode('utf-8')
            fp.write(key_bytes + b':' + str(value).encode('utf-8') + b'\n')
# Build, for one category, a structure that is convenient for counting.
def _contruct_word(cate_dir):
    """Collect the distinct tokens of every file in a category directory.

    Each entry of ``cate_dir`` except ``.DS_Store`` is read with
    ``_readfile`` and split on whitespace; the tokens of one file are
    reduced to a set, so a token appears at most once per file.

    Args:
        cate_dir: Directory whose entries are the documents of one category.

    Returns:
        A list with one set of tokens per file, in ``os.listdir`` order.
    """
    return [
        set(_readfile(cate_dir + '/' + entry).split())
        for entry in os.listdir(cate_dir)
        if entry != '.DS_Store'
    ]
# Count, for every word, the number of documents of its category it occurs in.
def count_word_frequents(seg_path, wordtimes_path):
    """Write per-category document frequencies of words to text files.

    Every sub-directory of ``seg_path`` is treated as one category. For each
    category, the function counts in how many of its documents each word of
    at least two characters occurs, then saves the counts with ``_savefile``
    to ``<wordtimes_path>/<category>_wordtimes.txt`` and prints a progress
    message.

    Paths are combined with ``os.path.join``, so both arguments work with or
    without a trailing separator. Entries of ``seg_path`` that are not
    directories (``.DS_Store`` or any other stray file) are skipped.

    Args:
        seg_path: Directory containing one sub-directory per category.
        wordtimes_path: Directory that receives the output files.
    """
    for cate in os.listdir(seg_path):
        cate_dir = os.path.join(seg_path, cate)
        # Only directories are categories; skip macOS metadata and stray files.
        if cate == '.DS_Store' or not os.path.isdir(cate_dir):
            continue
        save_file = os.path.join(wordtimes_path, cate + '_wordtimes.txt')
        word_frequents_dict = {}
        for each_content in _contruct_word(cate_dir):
            # _contruct_word yields one set per document, so a word is
            # counted at most once per document.
            for word in each_content:
                if len(word) >= 2:
                    word_frequents_dict[word] = word_frequents_dict.get(word, 0) + 1
        _savefile(save_file, word_frequents_dict)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('>>>' * 25)
        print('Writing in the file named %s \n' % save_file)
if __name__ == '__main__':
    print('start')
    # seg_path: the segmented (tokenized) training set.
    # wordtimes_path: where the word-count results are stored.
    seg_path, wordtimes_path = 'train/train_data_seg/', 'train/wordtimes/'
    count_word_frequents(seg_path, wordtimes_path)