-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwash_data.py
75 lines (65 loc) · 3.15 KB
/
wash_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
import jieba
import sys
# from nltk.corpus import stopwords
# # 加载英文停用词
# stop_words_en = set(stopwords.words('english'))
# Chinese stop-word list: meaningless filler words plus words that are not
# relevant to job requirements. Loaded once at import time, one entry per
# line of the stop-word file, with surrounding whitespace removed.
with open('./data/stop_words_cn.txt', 'r', encoding='utf-8') as stop_file:
    stop_words_cn = [entry.strip() for entry in stop_file]
# 清洗文本函数
def clean_text(text):
    """Tokenize *text* with jieba and return the cleaned, non-empty tokens.

    Every token produced by ``jieba.cut`` goes through these steps in order:

    1. It is lower-cased and every character that is neither a word
       character nor whitespace (``[^\\w\\s]``) is deleted.
    2. It is dropped if it appears in the module-level ``stop_words_cn``.
    3. A leading run of ASCII digits / Chinese numerals (list-item
       numbering) is stripped.
    4. It is dropped if only whitespace (or nothing) is left.

    Args:
        text: The raw text to clean.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Build the lookup set once per call: testing membership against the
    # stop-word list scans the whole list for every single token, while a
    # set answers each test in constant time. The result is identical.
    stop_words = set(stop_words_cn)

    # Compile both patterns once instead of looking them up per token.
    strip_punctuation = re.compile(r'[^\w\s]').sub
    # NOTE(review): punctuation has already been removed by the time this
    # pattern runs, so its optional trailing `.` / `、` / `)` part can never
    # match. It is kept unchanged to preserve the existing output.
    # NOTE(review): the pattern is applied to every token, so any token that
    # merely starts with a digit or Chinese numeral loses that prefix, and
    # purely numeric tokens disappear entirely — confirm this is intended.
    strip_numbering = re.compile(r'^[0-9一二三四五六七八九十]+[\.、))]?').sub

    cleaned = []
    for token in jieba.cut(text):
        token = strip_punctuation('', token.lower())
        if token in stop_words:
            continue
        token = strip_numbering('', token)
        if token.strip() != '':
            cleaned.append(token)
    return cleaned
# 总函数,清洗职位要求
def clean_requirements_file(job_keyword):
    """Clean the requirement text collected for one job keyword.

    Reads ``./data/output_reqs/{job_keyword}_job_reqs.txt`` line by line.
    A line starting with a requirements header closes the record gathered
    so far and starts a new one; a line starting with a benefits/salary
    header makes the following lines be skipped until the next requirements
    header. Every other non-empty line is appended to the current record.
    Each finished record is passed through ``clean_text`` and written as one
    line of space-separated tokens to
    ``./data/output_washed/{job_keyword}_cleaned_data.txt``.

    Progress, the cleaned records and the skipped lines are printed to
    stdout, followed by the total number of records written.

    Args:
        job_keyword: Keyword used to build the input and output file names.

    Raises:
        SystemExit: With status 1 when opening the files raises
            ``FileNotFoundError``.
    """
    requirement_headers = ("职位要求", "任职要求", "岗位要求", "任职资格", "Requirements", "我们需要你")
    skipped_headers = ("职位福利", "职位亮点", "福利", "薪酬", "薪资")
    count = 0

    try:
        with open(f'./data/output_reqs/{job_keyword}_job_reqs.txt', 'r', encoding='utf-8') as file, \
                open(f'./data/output_washed/{job_keyword}_cleaned_data.txt', 'w', encoding='utf-8') as outfile:

            def write_record(parts):
                """Clean one collected record, report it and write it out."""
                nonlocal count
                # Each collected line is preceded by a single space, exactly
                # as the text was assembled before being cleaned.
                cleaned = ' '.join(clean_text(" " + " ".join(parts)))
                count += 1
                # 1-based position inside the current batch of 50, so the
                # 50th record reads "50 /50" of its own round instead of
                # "0 /50" of the next one.
                print(f"=== 清洗中:{(count - 1) % 50 + 1} /50 (第 {(count - 1) // 50 + 1} 轮) ===")
                print(cleaned)
                outfile.write(cleaned + '\n')
                if count % 50 == 0:
                    outfile.flush()  # push every finished batch of 50 to disk

            current_parts = []
            ignore_lines = False  # True while inside a benefits/salary section
            for line in file:
                line = line.strip()
                if line.startswith(requirement_headers):
                    if current_parts:
                        write_record(current_parts)
                        current_parts = []
                    ignore_lines = False
                elif line.startswith(skipped_headers):
                    ignore_lines = True
                elif line and not ignore_lines:
                    current_parts.append(line)
                else:
                    # Blank lines and lines inside a skipped section.
                    print(f"=== 忽略:{line} ===")

            # The last record has no following header to close it; write it
            # through the same path so it is counted and reported as well.
            if current_parts:
                write_record(current_parts)
    except FileNotFoundError:
        # NOTE(review): a missing output directory also ends up here, although
        # the message only names the input file.
        print(f"文件 {job_keyword}_job_reqs.txt 未找到...")
        sys.exit(1)
    print(f"=== 清洗完成,共计 {count} 条数据!===")
# 调用:
# clean_requirements_file("数据挖掘")