"""
Scrape LinkedIn job offers using Selenium to simulate navigation
(clicks, scrolling) and BeautifulSoup to parse the HTML of each page.
Run every query listed in the configuration file, print the scraped
fields for each job offer, and write the resulting documents to MongoDB.
"""
from selenium.common.exceptions import TimeoutException
from utils import init_driver, get_job_urls, login, print_scraped_data,\
    load_config, get_unseen_urls, scroll_job_panel, connect_mongo
from time import sleep
from bs4 import BeautifulSoup
from classes.JobScraper import JobScraper
import argparse
parser = argparse.ArgumentParser(
    description=("Scrape LinkedIn job offers based on the " +
                 "queries specified in the conf file")
)
parser.add_argument('-c', '--conf',
                    type=str,
                    metavar='',
                    required=True,
                    help='Specify the path of the configuration file')
args = parser.parse_args()
conf = load_config(args.conf)
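
# The exact file format is handled by utils.load_config; a JSON conf along
# these lines is assumed here (only the keys are taken from the code below,
# the values are placeholders):
#
# {
#     "parameters": {
#         "CHROME_PATH": "/path/to/chrome",
#         "CHROMEDRIVER_PATH": "/path/to/chromedriver",
#         "JOB_QUERIES": ["data scientist", "data engineer"],
#         "HOST": "localhost"
#     },
#     "credentials": {
#         "LINUSERNAME": "linkedin_email",
#         "LINPWD": "linkedin_password",
#         "MONGOUSER": "mongo_user",
#         "MONGOPWD": "mongo_password"
#     }
# }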
parameters = conf["parameters"]
credentials = conf["credentials"]
CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
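
# Scraped offers are stored in the "jobs" collection of the "linkedin"
# database; the same collection is later used to skip URLs that have
# already been scraped (get_unseen_urls, count_documents).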
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)
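
# LinkedIn paginates search results 25 jobs at a time; pages beyond the
# first are requested by appending "&start=<offset>" to the search URL.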
JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="
for query in QUERIES:
    # Load the first page of results for the query and scroll the job
    # panel so that every job card on the page is rendered.
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # The total number of results is shown in a small header element,
    # e.g. "1,234 results".
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
    # Collect the job URLs from the remaining result pages, 25 jobs per page.
    start = 25
    while start < n_results:
        url = JOB_SEARCH_URL + query + "&start=" + str(start)
        try:
            driver.get(url)
            scroll_job_panel(driver)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_urls.extend(get_job_urls(soup))
        except TimeoutException:
            print(
                "\nINFO :: TimeoutException raised while getting " +
                "URL\n" + url
            )
        # Advance to the next page even after a timeout, so a page that
        # keeps timing out cannot stall the loop.
        start += 25
    if len(job_urls) == 0:
        print()
        print("WARNING :: Could not get any URLs for the query\n" +
              query)
        print("Please double-check that LinkedIn is not " +
              "blocking the query")
        continue
    # Keep only the URLs that are not already stored in the jobs collection.
    unseen_urls = get_unseen_urls(jobs, job_urls)
    if len(unseen_urls) != 0:
        print("INFO :: Resuming from URL", unseen_urls[0])
    else:
        print("INFO :: All job URLs for the query " + query +
              " have already been scraped. " +
              "Moving onto the next query if any.")
        continue
    # Visit each unseen job page, parse it and store the scraped document,
    # skipping documents that are already present in the collection.
    for url in unseen_urls:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        js = JobScraper(soup, url, query)
        job_data = js.get_job_data()
        if job_data and\
                not jobs.count_documents(job_data, limit=1):
            print_scraped_data(job_data)
            jobs.insert_one(job_data)

driver.quit()