-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.py
204 lines (173 loc) · 9.69 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 10 17:39:04 2023
@author: svetlanamaslenkova
"""
import pickle5 as pickle
from collections import Counter
import re
# paths to the scraped data
curr_path = '/Users/svetlanamaslenkova/Documents/DS_project/'
data_scraped_path = curr_path + 'data_scraped/'

# load the scraped jobs DataFrame
with open(data_scraped_path + 'gcc_jobs_df.pkl', 'rb') as handle:
    jobs_df = pickle.load(handle)
print(jobs_df.shape)

# a row counts as "has salary" when either salary field differs from the -1 sentinel
estimate_present = ~((jobs_df.salary_estimate == '-1') | (jobs_df.salary_estimate == -1))
description_present = ~((jobs_df.salary_description == '-1') | (jobs_df.salary_description == -1))
jobs_with_salary_df = jobs_df[estimate_present | description_present]
print('n of rows with salary: ', jobs_with_salary_df.shape[0])

# drop titles that are clearly outside the data field
jobs_to_delete = ['carpenter foreman', 'retail operation executive', 'lead generation specialist', 'digital marketing specialist']
unwanted_title = jobs_with_salary_df['job_title'].str.lower().str.contains('|'.join(jobs_to_delete))
jobs_with_salary_df = jobs_with_salary_df[~unwanted_title]
## clean and organize salary info
# work with both salary fields as strings
jobs_with_salary_df.salary_estimate = jobs_with_salary_df.salary_estimate.astype(str)
jobs_with_salary_df.salary_description = jobs_with_salary_df.salary_description.astype(str)

# flag which pay period each posting quotes
jobs_with_salary_df['hourly'] = jobs_with_salary_df['salary_estimate'].apply(
    lambda est: 1 if 'per hour' in est.lower() else 0)
jobs_with_salary_df['annually'] = jobs_with_salary_df['salary_description'].apply(
    lambda desc: 1 if ('per annum' in desc.lower() or 'usd' in desc.lower()) else 0)
# manual correction for the Cobblestone posting (row label 412)
jobs_with_salary_df.loc[412, 'annually'] = 1

# single salary column: prefer salary_description unless it is the '-1' sentinel
jobs_with_salary_df['salary'] = jobs_with_salary_df.salary_description.where(
    jobs_with_salary_df.salary_description != '-1',
    jobs_with_salary_df.salary_estimate)
# drop the bracketed qualifier (text after '(') from the salary string
jobs_with_salary_df['salary'] = jobs_with_salary_df['salary'].apply(lambda s: s.split('(')[0])


def _detect_currency(salary_text):
    """Return the currency code mentioned in a salary string, or -1 if none is found.

    USD is checked first (matches either 'usd' or '$'); the remaining codes are
    tried in the same order as the original chained conditional, first match wins.
    """
    lowered = salary_text.lower()
    if 'usd' in lowered or '$' in lowered:
        return 'usd'
    for code in ('aed', 'bhd', 'qar', 'sar', 'omr', 'kd'):
        if code in lowered:
            return code
    return -1


# create a column with the currency info
jobs_with_salary_df['currency'] = jobs_with_salary_df['salary'].apply(_detect_currency)
# parse numeric min/max salary out of the salary text
def _salary_bounds(salary_text):
    """Return (min, max) floats parsed from one salary string.

    'k'/'K' expands to thousands, letters and '$' are stripped, en dashes are
    normalised to '-', and a single figure (no range) is used as both bounds.
    """
    digits_only = re.sub('[a-zA-Z$]', '', salary_text.lower().replace('k', '000'))
    parts = digits_only.replace('–', '-').split('-')
    low = parts[0]
    high = parts[1] if len(parts) > 1 else parts[0]

    def _to_float(chunk):
        # commas and spaces are formatting only
        return float(chunk.replace(',', '').replace(' ', ''))

    return _to_float(low), _to_float(high)


_bounds = jobs_with_salary_df['salary'].apply(_salary_bounds)
jobs_with_salary_df['min_salary'] = _bounds.apply(lambda pair: pair[0])
jobs_with_salary_df['max_salary'] = _bounds.apply(lambda pair: pair[1])
# convert salaries to AED using fixed rates (snapshot, early 2023)
aed_rates = {'usd': 3.67,
             'bhd': 9.76,
             'qar': 1.01,
             'sar': 0.98,
             'omr': 9.57,
             'kd': 12,
             'aed': 1.0}


def _to_aed(amount, currency):
    """Convert amount to AED.

    An unrecognised currency (the -1 sentinel produced by the currency
    detection step) is left unconverted (rate 1.0) instead of raising
    KeyError, which a plain dict lookup would do and crash the whole apply.
    """
    return amount * aed_rates.get(currency, 1.0)


jobs_with_salary_df['min_salary'] = jobs_with_salary_df.apply(
    lambda row: _to_aed(row['min_salary'], row['currency']), axis=1)
jobs_with_salary_df['max_salary'] = jobs_with_salary_df.apply(
    lambda row: _to_aed(row['max_salary'], row['currency']), axis=1)

# average of the converted bounds
jobs_with_salary_df['avg_salary'] = (jobs_with_salary_df.min_salary + jobs_with_salary_df.max_salary) / 2


def _monthly(amount, hourly, annually):
    """Normalise a salary figure to a monthly amount (assumes 160 working hours/month)."""
    if hourly == 1:
        return amount * 160
    if annually == 1:
        return amount / 12
    return amount


# convert salary into monthly salary (avg was computed from pre-conversion
# bounds on purpose, matching the original pipeline order)
jobs_with_salary_df['min_salary'] = jobs_with_salary_df.apply(
    lambda row: _monthly(row.min_salary, row.hourly, row.annually), axis=1)
jobs_with_salary_df['max_salary'] = jobs_with_salary_df.apply(
    lambda row: _monthly(row.max_salary, row.hourly, row.annually), axis=1)
jobs_with_salary_df['avg_salary'] = jobs_with_salary_df.apply(
    lambda row: _monthly(row.avg_salary, row.hourly, row.annually), axis=1)
# company_name arrives as "Name\nRating"; keep only the part before the newline
jobs_with_salary_df['company_name'] = jobs_with_salary_df['company_name'].apply(
    lambda name: name.partition('\n')[0])

# flag postings whose age text contains "30" (i.e. "30+ days ago")
jobs_with_salary_df['older_than_30_days'] = jobs_with_salary_df['job_age'].apply(
    lambda age: 1 if '30' in age.lower() else 0)


def _job_family(title):
    """Bucket a job title into ds / da / ml / other — first match wins."""
    lowered = title.lower()
    if 'data scien' in lowered:
        return 'ds'
    if 'analy' in lowered:
        return 'da'
    if 'machine learning' in lowered:
        return 'ml'
    return 'other'


jobs_with_salary_df['job_fam'] = jobs_with_salary_df.job_title.apply(_job_family)

# company age; negative 'founded' values act as missing-data sentinels and pass through
jobs_with_salary_df['founded'] = jobs_with_salary_df['founded'].astype(int)
jobs_with_salary_df['company_age'] = jobs_with_salary_df.founded.apply(
    lambda year: year if year < 0 else 2023 - year)
## add required-skill flag columns (1 when the job description mentions any keyword)
_skill_keywords = {
    'python_yn': ('python',),
    'r_yn': ('r-studio', 'r studio'),
    'spark_yn': ('spark',),
    'aws_yn': ('aws',),
    'pytorch_yn': ('pytorch', 'torch'),
    'tf_yn': ('tensorflow',),
    'ml_yn': ('machine learning',),
    'pbi_tableau_yn': ('power bi', 'tableau'),
    'stats_yn': ('statisti',),
    'sql_yn': ('sql',),
}
# dict order matches the original column-creation order
for _col, _keywords in _skill_keywords.items():
    jobs_with_salary_df[_col] = jobs_with_salary_df['job_description'].apply(
        lambda desc, kws=_keywords: 1 if any(kw in desc.lower() for kw in kws) else 0)
# leftover notebook-style inspection; the result is discarded
jobs_with_salary_df.sum()
# keep the following skills: stats_yn, ml_yn, sql_yn, python_yn
## inspect word frequencies across job descriptions (exploratory; freq/words/
## two_words/wordscount are not used downstream but are kept for inspection)
# single join instead of repeated concatenation — the original += loop was O(n^2)
result_text = ' '.join(text.replace('\n', ' ')
                       for text in jobs_with_salary_df.job_description.values)
freq = Counter(result_text.split()).most_common(60)
words = re.findall(r'\w+', result_text)
# bigram frequencies, keeping only pairs seen more than once
two_words = [' '.join(pair) for pair in zip(words, words[1:])]
wordscount = {w: f for w, f in Counter(two_words).most_common() if f > 1}


def _experience_level(title):
    """Classify seniority from the job title; 'other' when no marker is present."""
    lowered = title.lower()
    if any(tag in lowered for tag in ('junior', 'jr.', 'entry')):
        return 'junior'
    if any(tag in lowered for tag in ('senior', 'sr.', 'expert', 'lead', 'principal')):
        return 'senior'
    return 'other'


jobs_with_salary_df['experience_level'] = jobs_with_salary_df['job_title'].apply(_experience_level)
# add a simplified job-title column


def _simplify_title(title):
    """Map a raw job title to a canonical role label — first keyword match wins."""
    lowered = title.lower()
    role_keywords = (
        ('analyst', 'analyst'),
        ('data engineer', 'data engineer'),
        ('data scientist', 'data scientist'),
        ('director', 'director'),
        ('manager', 'manager'),
        ('machine learning engineer', 'machine learning engineer'),
        ('research', 'researcher'),
    )
    for keyword, label in role_keywords:
        if keyword in lowered:
            return label
    return 'other'


jobs_with_salary_df['job_simp'] = jobs_with_salary_df['job_title'].apply(_simplify_title)
# notebook-style inspection; results are discarded
jobs_with_salary_df['job_simp'].value_counts()
jobs_with_salary_df['job_fam'].value_counts()
import geonamescache
gc = geonamescache.GeonamesCache()
def get_country(city, gc):
    """Resolve a job-location string to a country identifier.

    Parameters
    ----------
    city : str
        Location string from the posting (a city name, or 'remote').
    gc : geonamescache.GeonamesCache
        Gazetteer used for the city lookup.

    Returns
    -------
    'remote' for remote jobs; the city string itself for UAE cities (so
    individual Emirates stay distinguishable); the ISO country code otherwise;
    -1 when the gazetteer has no match for the city. (The original indexed
    `search_cities(...)[0]` unconditionally and raised IndexError on any
    unknown location; -1 matches the sentinel used elsewhere in this script.)
    """
    if city.lower() == 'remote':
        return 'remote'
    matches = gc.search_cities(city, case_sensitive=False)
    if not matches:
        return -1
    country_code = matches[0]['countrycode']
    if country_code == 'AE':
        return city
    return country_code
jobs_with_salary_df['country'] = jobs_with_salary_df['location'].apply(
    lambda loc: get_country(loc, gc))

# NOTE: columns are exported in the order the cleaning steps above created them
# (a large commented-out column-reordering list was removed as dead code)
jobs_with_salary_df.to_csv('glassdoor_jobs.csv', index=False)