"""helpers.py: web scraping and data wrangling helpers for job offers
published on computrabajo.cl."""

# Libraries for the web scraping process
import requests
from bs4 import BeautifulSoup
from IPython.core.display import clear_output

# Libraries for data wrangling and data cleaning
import re

import pandas as pd
def webscraping_jobs(search, pages):
    """Scrape job offers from computrabajo.cl for a given search term.

    Parameters:
        search: search term, e.g. "data scientist".
        pages: number of result pages to scrape.

    Returns a pandas DataFrame with one row per job offer.
    """
    # --- Start of web scraping code ---
    # Create new lists to store the scraped information
    titles = []
    regions = []
    features = []
    glosas = []
    profiles = []
    target_urls = []
    # Build the target URL of the main webpage from the search parameter
    # (this construction is specific to computrabajo.cl)
    front_url = "trabajo-de-" + search.replace(" ", "-")
    back_url = search.replace(" ", "%20")
    main_url = 'https://www.computrabajo.cl/'
    print("Running Web Scraping...")
    # Iterate over the requested number of result pages and collect the job
    # offer containers on each one.
    for page in range(1, pages + 1):
        url = main_url + front_url + "?q=" + back_url + '&p=' + str(page)
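        # The resulting URL looks like (illustrative, for search="data scientist"):
        # https://www.computrabajo.cl/trabajo-de-data-scientist?q=data%20scientist&p=1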
        main_response = requests.get(url)
        main_soup = BeautifulSoup(main_response.text, 'html.parser')
        soup_urls = main_soup.find_all('a', class_='js-o-link')
        # Each container links to a job offer; collect the full URL of every
        # offer into target_urls.
        for uri in soup_urls:
            target_urls.append(main_url + uri['href'])
    # Visit each URL in target_urls and extract the target information of the
    # job offer it points to.
    for target in target_urls:
        response = requests.get(target)
        soup = BeautifulSoup(response.text, 'html.parser')
        # The HTML of each job offer page splits into two main blocks: a
        # description list and a table-like list (specific to this webpage).
        try:
            frames = soup.find_all('ul', class_='p0 m0')
            description_frame = frames[0]
            table_frame = frames[1]
        except (AttributeError, IndexError):
            # Unexpected page layout: set both frames to None so that every
            # extraction below falls back to "nan" and the lists stay aligned.
            description_frame = None
            table_frame = None
        # Extract the main features of the table frame and store them in the
        # lists created above.
        try:
            temp_title = table_frame.find_all('p')[0]
            titles.append(temp_title.text)
        except (AttributeError, IndexError):
            titles.append("nan")
        try:
            temp_region = table_frame.find_all('p')[1]
            regions.append(temp_region.text)
        except (AttributeError, IndexError):
            regions.append("nan")
        try:
            feature = table_frame.find_all('p', class_='fw_b fs15 mt10')
            temp_feature = []
            for item in feature:
                temp_feature.append(item.find_next_sibling().text.strip())
            features.append(temp_feature)
        except (AttributeError, IndexError):
            features.append("nan")
        # Extract the main features of the description frame and store them in
        # the lists created above.
        try:
            temp_glosa = description_frame.find_all('li')[1]
            glosas.append(temp_glosa.text)
        except (AttributeError, IndexError):
            glosas.append("nan")
        try:
            temp_profile = description_frame.find_all('li')[3:]
            profiles.append(temp_profile)
        except (AttributeError, IndexError):
            profiles.append("nan")
        print(f"Extracting Job Offers for {search}: {len(titles)}")
        clear_output(wait=True)
print("Web Scraping Finished!")
print(f"Total Job Offers Extracted: {len(titles)}")
""" END OF WEB SCRAPING CODE"""
""" START OF DATA WRANGLING AND DATA CLEANING CODE"""
#Create new lists to store final features
companies = []
contracts = []
journeys = []
salaries = []
travels = []
residence = []
experience = []
regs = []
    # Split each scraped feature list into its individual fields. The number
    # of fields tells us which fields are present (see the layouts sketched
    # after this block); unknown layouts fall back to "nan" everywhere so the
    # lists stay the same length as titles.
    for feature in features:
        if feature == "nan" or len(feature) not in (2, 3, 4):
            companies.append("nan")
            contracts.append("nan")
            journeys.append("nan")
            salaries.append("nan")
        elif len(feature) == 2:
            contracts.append(feature[0])
            journeys.append(feature[1])
            companies.append("nan")
            salaries.append("nan")
        elif len(feature) == 4:
            companies.append(feature[0])
            contracts.append(feature[1])
            journeys.append(feature[2])
            salaries.append(feature[3])
        elif "$" in feature[2]:
            # Three fields with a salary: the company is missing
            contracts.append(feature[0])
            journeys.append(feature[1])
            salaries.append(feature[2])
            companies.append("nan")
        else:
            # Three fields without a salary: the salary is missing
            companies.append(feature[0])
            contracts.append(feature[1])
            journeys.append(feature[2])
            salaries.append("nan")
    # Extract the travel availability, residence and experience values from
    # each profile using regex; fall back to "nan" when a profile does not
    # mention them, so every list stays aligned with titles.
    for profile in profiles:
        found_travel = "nan"
        found_resid = "nan"
        found_exp = "nan"
        for prop in profile:
            reg_travel = re.findall(r"viajar:\s\w*", str(prop))
            reg_resid = re.findall(r"residencia:\s\w*", str(prop))
            reg_exp = re.findall(r"experiencia:\s\d*", str(prop))
            if reg_travel:
                found_travel = str(reg_travel)
            if reg_resid:
                found_resid = str(reg_resid)
            if reg_exp:
                found_exp = str(reg_exp)
        travels.append(found_travel)
        residence.append(found_resid)
        experience.append(found_exp)
    # Extract the education information (first remaining profile item); fall
    # back to "nan" when the profile is empty or could not be scraped.
    education = [str(t[0]) if isinstance(t, list) and t else "nan" for t in profiles]
    # Clean the text values of the travels, residence, experience and title features
    experience = [s.replace("['experiencia: ", "").replace("']", "") for s in experience]
    travels = [s.replace("['viajar: ", "").replace("']", "") for s in travels]
    residence = [s.replace("['residencia: ", "").replace("']", "") for s in residence]
    titles = [s.replace("\r\n", "").strip() for s in titles]
    regions = [s.replace("\r\n", "").strip() for s in regions]
    glosas = [s.replace("•", "").replace("\t", "").strip() for s in glosas]
    salaries = [s.replace("(Neto mensual)", "").replace("$", "").replace(".", "").replace(",", ".").strip() for s in salaries]
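    # e.g. a scraped salary "$800.000 (Neto mensual)" (hypothetical value)
    # becomes "800000" after the replacements above.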
    # Keep only the region part of the "city - region" location strings
    regions2 = [s.split("-") for s in regions]
    for region in regions2:
        if len(region) == 1:
            regs.append(region[0].strip())
        else:
            regs.append(region[1].strip())
    # Every list above is built with exactly one entry per job offer, so
    # experience, travels and residence already match the length of titles
    # (no re-alignment is needed to avoid length conflicts).
    # Store the final features in a DataFrame, one row per job offer
    df = pd.DataFrame({
        'oferta': titles,
        'region': regs,
        'empresa': companies,
        'salario': salaries,
        'experiencia': experience,
        'viajes': travels,
        'residencia': residence,
        'jornada': journeys,
        'contrato': contracts,
        'educacion': education,
        'glosa': glosas,
        'url': target_urls,
    })
    return df
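
# Minimal usage sketch (hypothetical search term and page count; assumes
# network access and that computrabajo.cl still uses the page structure
# targeted above):
if __name__ == "__main__":
    jobs_df = webscraping_jobs("data scientist", 2)
    print(jobs_df.head())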