-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscrapePremiumInsights.py
201 lines (184 loc) · 10.9 KB
/
scrapePremiumInsights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time
import math
from random import *
import csv
# Accumulators shared by the rest of the script.
csv_content = []  # NOTE(review): never referenced again in the visible file
header_content = ['company_link', 'company_name', 'average_tenure']  # output CSV header; add_to_stats() grows it dynamically
master_array = []  # one per-company stats row per line of the output CSV
## Get the company links from the spreadsheet
company_links = []
# Python 2 idiom: csv files are opened in binary mode ('rb').
with open('C:/Users/abagh/CREtech_Full_List_Premium_Insights.csv', 'rb') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=' ')
    for row in csv_reader:
        if len(row) == 0:
            # Blank spreadsheet row -> keep a placeholder so row alignment with the input is preserved
            company_links.append('')
        elif 'company-beta' in row[0]:
            # Cell already holds a direct company-page URL
            company_links.append(row[0])
        else:
            # Cell holds a people-search URL; strip the facet wrapper to recover the company id
            company_id = row[0].replace('https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B%22', '') # get only the id
            company_id = company_id.replace('%22%5D', '') # again removing unnecessary stuff
            company_links.append('https://www.linkedin.com/company-beta/' + company_id + '/')
print len(company_links)
def clean_growth(raw_growth):
    """Convert a LinkedIn growth-cell string to a plain int.

    Strips thousands separators, maps 'No change' to 0, and removes the
    '% increase' / '% decrease' suffixes (note: decreases are NOT negated;
    the sign information is discarded, as in the surrounding columns).
    If a stray '%' still remains after that, the numeric value is taken
    from the second line of the cell text.
    """
    text = raw_growth.replace(',', '').replace('No change', '0')
    for suffix in ('% increase', '% decrease'):
        text = text.replace(suffix, '')
    if '%' in text:
        text = text.split('\n')[1]
    return int(text.strip())
def replace(your_list, to_replace, replacement):
    """Set your_list[to_replace] = replacement in place and return the list.

    Out-of-range and negative indices are silently ignored — the original
    enumerate-based scan never visited such positions, and that no-op
    behavior is preserved here.  The original walked the entire list to
    find a single known index (O(n) for an O(1) assignment); this does the
    assignment directly.
    """
    if 0 <= to_replace < len(your_list):
        your_list[to_replace] = replacement
    return your_list
def add_to_stats(header_name, value_to_insert):
    """Record value_to_insert in the company_stats column for header_name.

    Looks header_name up in the module-level header_content list, appending
    it as a new column if it is not yet known, then writes the value into
    the module-level company_stats row at the matching index via replace().
    """
    if header_name not in header_content:
        header_content.append(header_name)
    column = header_content.index(header_name)
    replace(company_stats, column, value_to_insert)
## General Variables
page_count = 0  # NOTE(review): not referenced again in the visible file
## Create the driver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 6) # To be used to detect if elements have loaded
driver.implicitly_wait(8) # seconds
## Open the Linkedin page and login
driver.get(
    'https://www.linkedin.com/uas/login?session_redirect=%2Fvoyager%2FloginRedirect%2Ehtml&fromSignIn=true&trk=uno-reg-join-sign-in') # Login page
wait.until(EC.presence_of_element_located((By.ID, 'btn-primary'))) # Wait for the sign in button to load first
login = driver.find_element_by_id('session_key-login') # Get username box
# SECURITY(review): credentials are hard-coded in source; move them to
# environment variables or a config file kept out of version control.
login.send_keys('[email protected]') # Enter username
password = driver.find_element_by_id('session_password-login') # Get password box
password.send_keys('dantrinet') # Enter your password
password.submit()
time.sleep(7)  # allow the post-login redirect to finish
# LinkedIn sometimes interposes a manual security challenge; block until the
# operator solves it in the browser, then give the page time to settle.
if 'a quick security' in driver.page_source:
    raw_input("A quick security check")
    time.sleep(30)
# Verification-PIN challenge: prompt the operator for the code and submit it.
if 'pin' in driver.page_source:
    pin = driver.find_element_by_id("verification-code")
    pin.send_keys(raw_input("Enter pin"))
    pin.submit()
    time.sleep(10)
for company_link in company_links: # Loop through each link for each company
company_stats = [] # Where their names will be stored
if company_link == '': # If the company is a blank string
add_to_stats('Error', 'No link')
print 'no link'
add_to_stats('company_link', 'None')
master_array.append(company_stats)
continue
for i in range(100):
company_stats.append('')
driver.get(company_link) # Open up the employee page
try:
company_name = driver.find_element_by_xpath('//h1[contains(@class, "org-top-card-module__name")]') # Wait for the total search results box to load
except (NoSuchElementException, TimeoutException):
print 'Excepted: ' + company_link
add_to_stats('Error', 'Loading took too long')
add_to_stats('company_link', company_link)
time.sleep(120)
master_array.append(company_stats)
continue
add_to_stats('company_link', company_link)
add_to_stats('company_name', company_name.text.encode('utf-8'))
time.sleep(uniform(2, 5)) # Throttle
driver.execute_script(
"window.scrollTo(0, " + str((randint(900, 1000))) + ");") # Scroll down to the mid page
time.sleep(uniform(2, 4)) # Throttle, allow content to load
try: # employee headcount
avg_ten = driver.find_element_by_xpath('//div[contains(@class, "org-insights-module__facts")]').text
avg_ten = avg_ten.replace('Average tenure', '')
avg_ten = avg_ten.replace(' years', '')
add_to_stats('average_tenure', avg_ten)
print 'ten: ' + avg_ten
time.sleep(uniform(2, 4))
test_for_basics = driver.find_element_by_xpath('//table[contains(@class, "org-insights-module__summary-table")]/tr[1]')
employee_growth = driver.find_elements_by_xpath('//table[contains(@class, "org-insights-module__summary-table")]/tr[1]/td')
values = []
for td in employee_growth:
values.append(clean_growth(td.text))
employee_growth_names = driver.find_elements_by_xpath('//table[contains(@class, "org-insights-module__summary-table")]/tr[2]/th')
for i in range(0, len(employee_growth_names)):
add_to_stats('General ' + employee_growth_names[i].text, values[i])
except (NoSuchElementException, TimeoutException):
add_to_stats('Error', 'No Tenure Option')
master_array.append(company_stats)
continue
try: # employee distribution
button = driver.find_element_by_xpath('//button[contains(@class, "org-insights-premium-dropdown__trigger")]') ## Get the employee distribution table
button.send_keys(webdriver.common.keys.Keys.SPACE)
employee_distributions = driver.find_elements_by_xpath('//button[@class="org-insights-premium-dropdown__option-button"][@aria-pressed="false"]')
for distribution in employee_distributions:
distribution.send_keys(webdriver.common.keys.Keys.SPACE) # Show all items in table
time.sleep(uniform(.25, .4))
first_table = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[0]
distribution_growth_percentages = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[0].find_elements_by_xpath('tr/td/span/span[2]')
distribution_growth_array = []
for percentage in distribution_growth_percentages:
distribution_growth_array.append(clean_growth(percentage.text))
distribution_growth_names = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[0].find_elements_by_xpath('tr/th')
category_name = distribution_growth_names.pop(0) # 6m, 1y, and/or 2y
job_openings_names = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[0].find_elements_by_xpath('tr/td/div')
number_of_time_categories = len(distribution_growth_names)
for i in range(len(job_openings_names)):
if number_of_time_categories >= 1:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[0].text.encode('utf-8'), (distribution_growth_array[i * number_of_time_categories]))
if number_of_time_categories >= 2:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[1].text.encode('utf-8'), distribution_growth_array[i * number_of_time_categories + 1])
if number_of_time_categories >= 3:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[2].text.encode('utf-8'), distribution_growth_array[i * number_of_time_categories + 2])
except (NoSuchElementException, TimeoutException):
pass
time.sleep(uniform(2, 4))
try: # job openings
driver.execute_script(
"window.scrollTo(0, " + str((randint(3100, 3200))) + ");") # Scroll down to the bottom of the page
time.sleep(uniform(3, 5))
button = driver.find_elements_by_xpath('//button[contains(@class, "org-insights-premium-dropdown__trigger")]') ## Get the employee distribution table
if len(button) < 2: # Meaning it doesn't have the other graph
raise NoSuchElementException
button = button[1] # Second button on page
button.send_keys(webdriver.common.keys.Keys.SPACE)
employee_distributions = driver.find_elements_by_xpath('//button[@class="org-insights-premium-dropdown__option-button"][@aria-pressed="false"]')
for distribution in employee_distributions:
distribution.send_keys(webdriver.common.keys.Keys.SPACE) # Show all items in table
time.sleep(uniform(.25, .4))
second_table = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[1]
distribution_growth_percentages = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[1].find_elements_by_xpath('tr/td/span/span[2]')
distribution_growth_array = []
for percentage in distribution_growth_percentages:
distribution_growth_array.append(clean_growth(percentage.text))
distribution_growth_names = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[1].find_elements_by_xpath('tr/th')
category_name = distribution_growth_names.pop(0) # 6m, 1y, and/or 2y
job_openings_names = driver.find_elements_by_xpath('//table[@class="org-insights-functions-growth__table"]')[1].find_elements_by_xpath('tr/td/div')
number_of_time_categories = len(distribution_growth_names)
for i in range(len(job_openings_names)):
if number_of_time_categories >= 1:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[0].text.encode('utf-8'), (distribution_growth_array[i * number_of_time_categories]))
if number_of_time_categories >= 2:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[1].text.encode('utf-8'), distribution_growth_array[i * number_of_time_categories + 1])
if number_of_time_categories >= 3:
add_to_stats(category_name.text + ' ' + job_openings_names[i].text + '_' + distribution_growth_names[2].text.encode('utf-8'), distribution_growth_array[i * number_of_time_categories + 2])
except (NoSuchElementException, TimeoutException):
pass
print header_content
with open("C:/Users/abagh/tenure.csv", "wb") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
master_array.append(company_stats)
writer.writerow(header_content)
for row in master_array:
writer.writerow(row)
print 'done!!!'