enh: google jobs
cullenwatson committed Oct 24, 2024
1 parent f395597 commit e08104d
Showing 12 changed files with 330 additions and 17 deletions.
14 changes: 7 additions & 7 deletions README.md
@@ -9,7 +9,7 @@ work with us.*

## Features

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google** & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxies support

@@ -30,9 +30,9 @@ import csv
from jobspy import scrape_jobs

jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
search_term="software engineer",
location="Dallas, TX",
location="San Francisco, CA",
results_wanted=20,
    hours_old=72, # (only LinkedIn/Indeed are hour-specific; others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
@@ -80,9 +80,6 @@ Optional
| in format ['user:pass@host:port', 'localhost']
| each job board scraper will round robin through the proxies
|
├── ca_cert (str)
| path to CA Certificate file for proxies
├── is_remote (bool)
├── results_wanted (int):
@@ -116,6 +113,9 @@ Optional
|
├── enforce_annual_salary (bool):
| converts wages to annual salary
|
├── ca_cert (str)
| path to CA Certificate file for proxies
```
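For reference, a minimal sketch of passing proxies together with the relocated `ca_cert` option, per the parameter docs above (the proxy entries and certificate path are hypothetical placeholders):

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["google"],
    search_term="software engineer",
    proxies=["user:pass@host:port", "localhost"],  # round-robined across the scrapers
    ca_cert="/path/to/ca_bundle.pem",  # hypothetical path to a CA certificate file
)
```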

```
@@ -168,7 +168,7 @@ Indeed specific
├── company_employees_label
├── company_revenue_label
├── company_description
└── logo_photo_url
└── company_logo
```

## Supported Countries for Job Searching
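To try the newly added Google board end to end, a minimal sketch (the `company_logo` column name comes from the reordered output columns in this commit; exact DataFrame columns may vary):

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["google"],
    search_term="software engineer",
    location="San Francisco, CA",
    results_wanted=20,
)
print(f"found {len(jobs)} jobs")
print(jobs[["title", "company_logo"]].head())  # company_logo replaces logo_photo_url
```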
7 changes: 5 additions & 2 deletions src/jobspy/__init__.py
@@ -9,13 +9,15 @@
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
GoogleJobsException,
)


@@ -50,6 +52,7 @@ def scrape_jobs(
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.GOOGLE: GoogleJobsScraper,
}
set_logger_level(verbose)

@@ -223,12 +226,12 @@ def convert_to_annual(job_data: dict):
"is_remote",
"job_level",
"job_function",
"company_industry",
"listing_type",
"emails",
"description",
"company_industry",
"company_url",
"logo_photo_url",
"company_logo",
"company_url_direct",
"company_addresses",
"company_num_employees",
2 changes: 1 addition & 1 deletion src/jobspy/jobs/__init__.py
@@ -256,7 +256,7 @@ class JobPost(BaseModel):
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
logo_photo_url: str | None = None
company_logo: str | None = None
banner_photo_url: str | None = None

# linkedin only atm
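Because `JobPost` is a Pydantic model, the rename shows up anywhere a post is built directly; a small sketch with hypothetical values (fields not shown are assumed optional):

```python
from jobspy.jobs import JobPost, Location

post = JobPost(
    id="go-123",  # hypothetical id, following the new scraper's "go-" prefix
    title="Software Engineer",
    company_name="Example Corp",
    job_url="https://www.example.com/jobs/123",
    location=Location(city="San Francisco", state="CA"),
    company_logo="https://www.example.com/logo.png",  # formerly logo_photo_url
)
print(post.company_logo)
```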
7 changes: 6 additions & 1 deletion src/jobspy/scrapers/__init__.py
@@ -17,11 +17,14 @@ class Site(Enum):
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"


class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"


class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
@@ -42,7 +45,9 @@ class ScraperInput(BaseModel):


class Scraper(ABC):
def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
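The reformatted constructor is the contract every board scraper shares; a hypothetical subclass (illustrative only, not part of this commit) makes the shape explicit:

```python
from jobspy.scrapers import Scraper, ScraperInput, Site, JobResponse

class ExampleScraper(Scraper):  # hypothetical scraper for illustration
    def __init__(self, proxies: list[str] | None = None, ca_cert: str | None = None):
        super().__init__(Site.GOOGLE, proxies=proxies, ca_cert=ca_cert)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        # a real implementation would fetch and parse listings here
        return JobResponse(jobs=[])
```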
5 changes: 5 additions & 0 deletions src/jobspy/scrapers/exceptions.py
@@ -24,3 +24,8 @@ def __init__(self, message=None):
class GlassdoorException(Exception):
def __init__(self, message=None):
super().__init__(message or "An error occurred with Glassdoor")


class GoogleJobsException(Exception):
def __init__(self, message=None):
super().__init__(message or "An error occurred with Google Jobs")
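The new exception mirrors its siblings; assuming it propagates out of `scrape_jobs` the way the other board exceptions do, handling it might look like:

```python
from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import GoogleJobsException

try:
    jobs = scrape_jobs(site_name=["google"], search_term="software engineer")
except GoogleJobsException as e:
    print(f"google jobs scrape failed: {e}")
```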
2 changes: 1 addition & 1 deletion src/jobspy/scrapers/glassdoor/__init__.py
@@ -214,7 +214,7 @@ def _process_job(self, job_data):
is_remote=is_remote,
description=description,
emails=extract_emails_from_text(description) if description else None,
logo_photo_url=company_logo,
company_logo=company_logo,
listing_type=listing_type,
)

217 changes: 217 additions & 0 deletions src/jobspy/scrapers/google/__init__.py
@@ -0,0 +1,217 @@
"""
jobspy.scrapers.google
~~~~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Google Jobs.
"""

from __future__ import annotations

import math
import re
import json
from typing import Tuple
from datetime import datetime, timedelta

from .constants import headers_jobs, headers_initial, async_param
from .. import Scraper, ScraperInput, Site
from ..utils import (
    extract_emails_from_text,
    create_logger,
    extract_job_type,
    create_session,
)
from ...jobs import (
JobPost,
JobResponse,
Location,
JobType,
)

logger = create_logger("Google")


class GoogleJobsScraper(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
        Initializes GoogleJobsScraper with the Google Jobs search url
"""
site = Site(Site.GOOGLE)
super().__init__(site, proxies=proxies, ca_cert=ca_cert)

self.base_url = None
self.country = None
self.session = None
self.scraper_input = None
self.jobs_per_page = 10
self.seen_urls = set()
self.url = "https://www.google.com/search"
self.jobs_url = "https://www.google.com/async/callback:550"

def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
        Scrapes Google Jobs for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()

self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
)
forward_cursor = self._get_initial_cursor()
if forward_cursor is None:
logger.error("initial cursor not found")
return JobResponse(jobs=[])

page = 1
job_list: list[JobPost] = []

while (
len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
and forward_cursor
):
logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
if not jobs:
logger.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
return JobResponse(
jobs=job_list[
scraper_input.offset : scraper_input.offset
+ scraper_input.results_wanted
]
)

def _get_initial_cursor(self):
"""Gets initial cursor to paginate through job listings"""
query = f"{self.scraper_input.search_term} jobs"

def get_time_range(hours_old):
if hours_old <= 24:
return "since yesterday"
elif hours_old <= 72:
return "in the last 3 days"
elif hours_old <= 168:
return "in the last week"
else:
return "in the last month"

job_type_mapping = {
JobType.FULL_TIME: "Full time",
JobType.PART_TIME: "Part time",
JobType.INTERNSHIP: "Internship",
JobType.CONTRACT: "Contract",
}

if self.scraper_input.job_type in job_type_mapping:
query += f" {job_type_mapping[self.scraper_input.job_type]}"

if self.scraper_input.location:
query += f" near {self.scraper_input.location}"

if self.scraper_input.hours_old:
time_filter = get_time_range(self.scraper_input.hours_old)
query += f" {time_filter}"

if self.scraper_input.is_remote:
query += " remote"

params = {"q": query, "udm": "8"}
response = self.session.get(self.url, headers=headers_initial, params=params)

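        # the forward cursor for pagination lives in the div's data-async-fc attribute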
pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
match_fc = re.search(pattern_fc, response.text)
data_async_fc = match_fc.group(1) if match_fc else None
return data_async_fc

def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
return self._parse_jobs(response.text)

def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
"""
Parses jobs on a page with next page cursor
"""
start_idx = job_data.find("[[[")
end_idx = job_data.rindex("]]]") + 3
s = job_data[start_idx:end_idx]
parsed = json.loads(s)[0]

pattern_fc = r'data-async-fc="([^"]+)"'
match_fc = re.search(pattern_fc, job_data)
data_async_fc = match_fc.group(1) if match_fc else None
jobs_on_page = []

        for array in parsed:
            _, job_data = array
if not job_data.startswith("[[["):
continue
job_d = json.loads(job_data)

job_info = self._find_job_info(job_d)

job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
if job_url in self.seen_urls:
continue
self.seen_urls.add(job_url)

title = job_info[0]
company_name = job_info[1]
location = city = job_info[2]
state = country = date_posted = None
if location and "," in location:
city, state, *country = [*map(lambda x: x.strip(), location.split(","))]

days_ago_str = job_info[12]
            if isinstance(days_ago_str, str):
                match = re.search(r"\d+", days_ago_str)
                if match:
                    days_ago = int(match.group())
                    date_posted = (datetime.now() - timedelta(days=days_ago)).date()

description = job_info[19]

job_post = JobPost(
id=f"go-{job_info[28]}",
title=title,
company_name=company_name,
location=Location(
city=city, state=state, country=country[0] if country else None
),
job_url=job_url,
job_url_direct=job_url,
date_posted=date_posted,
is_remote="remote" in description.lower()
or "wfh" in description.lower(),
description=description,
emails=extract_emails_from_text(description),
job_type=extract_job_type(description),
)
jobs_on_page.append(job_post)
return jobs_on_page, data_async_fc

@staticmethod
def _find_job_info(jobs_data: list | dict) -> list | None:
"""Iterates through the JSON data to find the job listings"""
if isinstance(jobs_data, dict):
for key, value in jobs_data.items():
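                # "520084652" is the numeric key whose value holds the job listing payload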
if key == "520084652" and isinstance(value, list):
return value
else:
result = GoogleJobsScraper._find_job_info(value)
if result:
return result
elif isinstance(jobs_data, list):
for item in jobs_data:
result = GoogleJobsScraper._find_job_info(item)
if result:
return result
return None
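For completeness, a minimal sketch of driving the new scraper directly rather than through `scrape_jobs`, assuming the remaining `ScraperInput` fields carry defaults:

```python
from jobspy.scrapers import ScraperInput, Site
from jobspy.scrapers.google import GoogleJobsScraper

scraper = GoogleJobsScraper()
criteria = ScraperInput(
    site_type=[Site.GOOGLE],
    search_term="software engineer",
    location="San Francisco, CA",
    results_wanted=20,
)
response = scraper.scrape(criteria)
for job in response.jobs:
    print(job.title, job.company_name, job.job_url)
```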