-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
167 lines (135 loc) · 6.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import os
import streamlit as st
from spider_urls import scrape_job_links
from spider_req import scrape_job_requirements
from wash_data import clean_requirements_file
from get_keywords import process_keywords
import time
def main():
st.set_page_config(page_title="求职需求爬爬爬")
st.title("🧑💻求职需求爬爬爬 - 词云版")
st.markdown("<br>", unsafe_allow_html=True)
st.markdown("""
<p style="margin-bottom:0; font-style: italic;">
爬虫来源:<a href="https://www.zhaopin.com/">智联招聘</a><br>
项目作者:@Tutu</p>
""", unsafe_allow_html=True)
# 间隔线
st.markdown("---")
# 搜索框和按钮
left, right = st.columns([5,1])
job_keyword = left.text_input("请输入要爬取的职位关键词", "", placeholder="职位关键字")
right.markdown('<div style="height: 34px;"></div>', unsafe_allow_html=True)
button = right.button("生成词云")
# 错误消息占位符
error_placeholder = st.empty()
# 检查是否点击按钮且没有输入关键词
if button and not job_keyword:
error_placeholder.error("请输入职位关键词!")
time.sleep(2)
error_placeholder.empty()
else:
error_placeholder.empty()
st.markdown("<br>", unsafe_allow_html=True)
# 字体选择(可根据 ./data/fonts 文件夹中进行 font_option 的修改)
font_option = st.selectbox("选择词云字体", ("PingFangSC-Medium.otf", "PingFangSC-Light.otf", "PingFangSC-Semibold.otf", "HYZhengYuan-55W.ttf", "MiSans VF.ttf",
"OPPOSans-B.ttf", "OPPOSans-L.ttf", "OPPOSans-R.ttf", "HarmonyOS_Sans_SC_Thin.ttf", "HarmonyOS_Sans_SC_Medium.ttf", "HarmonyOS_Sans_SC_Black.ttf"), index=0)
st.markdown("<br>", unsafe_allow_html=True)
if 'save_path' not in st.session_state:
st.session_state.save_path = None
# 上传背景图片
uploaded_file = st.file_uploader("自定义词云背景图片(白底为佳)", type=["jpg", "png", "jpeg", "webp"])
if uploaded_file:
col1, col2, col3 = st.columns(3)
with col2:
# 图片居中显示
st.image(uploaded_file, caption="预览", width=200)
current_dir = os.path.dirname(os.path.abspath(__file__))
save_dir = os.path.join(current_dir, "data/image")
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, uploaded_file.name)
st.write(f"{save_path}")
if button:
with open(save_path, "wb") as f:
f.write(uploaded_file.getbuffer())
st.session_state.save_path = save_path
# 按钮触发爬虫过程
if button and job_keyword:
st.markdown("<br>", unsafe_allow_html=True)
# 检查所需文件是否存在
if all([
os.path.exists(f'./data/output_reqs/{job_keyword}_job_reqs.txt'),
os.path.exists(f'./data/output_urls/{job_keyword}_job_links.txt'),
os.path.exists(f'./data/output_washed/{job_keyword}_cleaned_data.txt')
]):
with st.spinner('正在生成词云... 请稍等'):
# 直接生成词云
process_keywords(job_keyword, st.session_state.save_path, font_option)
st.success(f"词云已生成,请查看或保存。")
else:
with st.spinner('正在爬取职位信息... 约1.5h,耗时较长,可最小化本窗口做其他的事儿 :)'):
# 爬取职位链接
scrape_job_links(job_keyword)
# 爬取职位要求
scrape_job_requirements(job_keyword)
# 清洗数据
clean_requirements_file(job_keyword)
# 生成关键词并生成词云
process_keywords(job_keyword, st.session_state.save_path, font_option)
st.success(f"爬取完成,词云已生成,请查看或保存。")
wordcloud_path = f"./data/output_wordcloud/{job_keyword}.png"
if os.path.exists(wordcloud_path):
col1, col2, col3 = st.columns(3)
with col2:
# 图片居中显示
st.image(wordcloud_path, caption="词云图展示", use_container_width=True)
st.markdown("---")
st.write("📌推荐以下词云背景图")
# 预置词云背景图片流
image_urls = [
"./data/image/chiikawa.webp",
"./data/image/pink_flower_emoji.jpg",
"./data/image/moon.jpg",
"./data/image/pink_cake.jpg",
"./data/image/pink_telephone.png",
"./data/image/shaun_the_sheep.jpg",
"./data/image/strawberry.jpg",
"./data/image/pixel_star.jpg",
"./data/image/white_green_flower.jpg",
"./data/image/tutu.jpg",
"./data/image/record.jpg"
]
if 'clicked_image' not in st.session_state:
st.session_state.clicked_image = None
# 创建两列布局
col1, col2 = st.columns(2)
# 遍历图片URL列表
for i, url in enumerate(image_urls):
# 根据索引决定将图片放在哪一列
column = col1 if i % 2 == 0 else col2
with column:
# 创建一个容器来包装图片和隐藏的按钮
container = st.container()
caption = os.path.splitext(os.path.basename(url))[0]
# 显示图片
container.image(url, caption=caption, use_container_width=True)
# 创建按钮
if st.session_state.clicked_image == i:
if st.button("已选择", key=f"btn_{i}", type="primary"):
st.session_state.clicked_image = None
else:
if st.button("使用图片", key=f"btn_{i}"):
st.session_state.clicked_image = i
st.write(f"已选择图片地址: {url}")
st.session_state.save_path = url
# 添加CSS来调整按钮样式
st.markdown("""
<style>
.stButton > button {
width: 100%;
margin-top: -15px;
}
</style>
""", unsafe_allow_html=True)
if __name__ == "__main__":
main()