diff --git a/README.md b/README.md
index 391bcc6c..148f2be8 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ work with us.*
## Features
-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google** & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxies support
@@ -30,9 +30,9 @@ import csv
from jobspy import scrape_jobs
jobs = scrape_jobs(
- site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
+ site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
search_term="software engineer",
- location="Dallas, TX",
+ location="San Francisco, CA",
results_wanted=20,
    hours_old=72, # (only LinkedIn/Indeed are hour-specific; others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
@@ -80,9 +80,6 @@ Optional
| in format ['user:pass@host:port', 'localhost']
| each job board scraper will round robin through the proxies
|
-├── ca_cert (str)
-| path to CA Certificate file for proxies
-│
├── is_remote (bool)
│
├── results_wanted (int):
@@ -116,6 +113,9 @@ Optional
|
├── enforce_annual_salary (bool):
| converts wages to annual salary
+|
+├── ca_cert (str)
+| path to CA Certificate file for proxies
```
```
@@ -168,7 +168,7 @@ Indeed specific
├── company_employees_label
├── company_revenue_label
├── company_description
-└── logo_photo_url
+└── company_logo
```
## Supported Countries for Job Searching
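
The README change above slots Google into the existing call. A minimal usage sketch for the new board on its own; the `company` column name is assumed from JobSpy's existing output schema:

```python
from jobspy import scrape_jobs

# scrape only Google Jobs; other boards can be mixed into the same call
jobs = scrape_jobs(
    site_name=["google"],
    search_term="software engineer",
    location="San Francisco, CA",
    results_wanted=10,
)
print(jobs[["title", "company", "company_logo"]].head())
```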
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index f9f02ad8..6c8573b3 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -9,6 +9,7 @@
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
+from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
@@ -16,6 +17,7 @@
IndeedException,
ZipRecruiterException,
GlassdoorException,
+ GoogleJobsException,
)
@@ -50,6 +52,7 @@ def scrape_jobs(
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
+ Site.GOOGLE: GoogleJobsScraper,
}
set_logger_level(verbose)
@@ -223,12 +226,12 @@ def convert_to_annual(job_data: dict):
"is_remote",
"job_level",
"job_function",
- "company_industry",
"listing_type",
"emails",
"description",
+ "company_industry",
"company_url",
- "logo_photo_url",
+ "company_logo",
"company_url_direct",
"company_addresses",
"company_num_employees",
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 48ef824f..c51839c0 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -256,7 +256,7 @@ class JobPost(BaseModel):
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
- logo_photo_url: str | None = None
+ company_logo: str | None = None
banner_photo_url: str | None = None
# linkedin only atm
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 8ca0539b..492fd77f 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -17,11 +17,14 @@ class Site(Enum):
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
+ GOOGLE = "google"
+
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
+
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
@@ -42,7 +45,9 @@ class ScraperInput(BaseModel):
class Scraper(ABC):
- def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
+ def __init__(
+ self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
+ ):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py
index e49680bd..eba04794 100644
--- a/src/jobspy/scrapers/exceptions.py
+++ b/src/jobspy/scrapers/exceptions.py
@@ -24,3 +24,8 @@ def __init__(self, message=None):
class GlassdoorException(Exception):
def __init__(self, message=None):
super().__init__(message or "An error occurred with Glassdoor")
+
+
+class GoogleJobsException(Exception):
+ def __init__(self, message=None):
+ super().__init__(message or "An error occurred with Google Jobs")
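
A quick sketch of the new exception's default message, taken directly from the class above:

```python
from jobspy.scrapers.exceptions import GoogleJobsException

try:
    raise GoogleJobsException()  # no message: falls back to the default
except GoogleJobsException as e:
    print(e)  # "An error occurred with Google Jobs"
```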
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index eab4ee5f..d2de6dcf 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -214,7 +214,7 @@ def _process_job(self, job_data):
is_remote=is_remote,
description=description,
emails=extract_emails_from_text(description) if description else None,
- logo_photo_url=company_logo,
+ company_logo=company_logo,
listing_type=listing_type,
)
diff --git a/src/jobspy/scrapers/google/__init__.py b/src/jobspy/scrapers/google/__init__.py
new file mode 100644
index 00000000..4f8ecbdf
--- /dev/null
+++ b/src/jobspy/scrapers/google/__init__.py
@@ -0,0 +1,217 @@
+"""
+jobspy.scrapers.google
+~~~~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Google Jobs.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+import json
+from typing import Tuple
+from datetime import datetime, timedelta
+
+from .constants import headers_jobs, headers_initial, async_param
+from .. import Scraper, ScraperInput, Site
+from ..utils import (
+    extract_emails_from_text,
+    create_logger,
+    extract_job_type,
+    create_session,
+)
+from ...jobs import (
+ JobPost,
+ JobResponse,
+ Location,
+ JobType,
+)
+
+logger = create_logger("Google")
+
+
+class GoogleJobsScraper(Scraper):
+ def __init__(
+ self, proxies: list[str] | str | None = None, ca_cert: str | None = None
+ ):
+ """
+        Initializes GoogleJobsScraper with the Google Jobs search url
+ """
+ site = Site(Site.GOOGLE)
+ super().__init__(site, proxies=proxies, ca_cert=ca_cert)
+
+ self.base_url = None
+ self.country = None
+ self.session = None
+ self.scraper_input = None
+ self.jobs_per_page = 10
+ self.seen_urls = set()
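+        # the plain search page yields the initial pagination cursor;
+        # the async "callback:550" endpoint serves each subsequent page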
+ self.url = "https://www.google.com/search"
+ self.jobs_url = "https://www.google.com/async/callback:550"
+
+ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+ """
+        Scrapes Google for jobs with scraper_input criteria.
+ :param scraper_input: Information about job search criteria.
+ :return: JobResponse containing a list of jobs.
+ """
+ self.scraper_input = scraper_input
+ self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
+
+ self.session = create_session(
+ proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
+ )
+ forward_cursor = self._get_initial_cursor()
+ if forward_cursor is None:
+ logger.error("initial cursor not found")
+ return JobResponse(jobs=[])
+
+ page = 1
+ job_list: list[JobPost] = []
+
+ while (
+ len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
+ and forward_cursor
+ ):
+ logger.info(
+ f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
+ )
+ jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+ if not jobs:
+ logger.info(f"found no jobs on page: {page}")
+ break
+ job_list += jobs
+ page += 1
+ return JobResponse(
+ jobs=job_list[
+ scraper_input.offset : scraper_input.offset
+ + scraper_input.results_wanted
+ ]
+ )
+
+ def _get_initial_cursor(self):
+ """Gets initial cursor to paginate through job listings"""
+ query = f"{self.scraper_input.search_term} jobs"
+
+ def get_time_range(hours_old):
+ if hours_old <= 24:
+ return "since yesterday"
+ elif hours_old <= 72:
+ return "in the last 3 days"
+ elif hours_old <= 168:
+ return "in the last week"
+ else:
+ return "in the last month"
+
+ job_type_mapping = {
+ JobType.FULL_TIME: "Full time",
+ JobType.PART_TIME: "Part time",
+ JobType.INTERNSHIP: "Internship",
+ JobType.CONTRACT: "Contract",
+ }
+
+ if self.scraper_input.job_type in job_type_mapping:
+ query += f" {job_type_mapping[self.scraper_input.job_type]}"
+
+ if self.scraper_input.location:
+ query += f" near {self.scraper_input.location}"
+
+ if self.scraper_input.hours_old:
+ time_filter = get_time_range(self.scraper_input.hours_old)
+ query += f" {time_filter}"
+
+ if self.scraper_input.is_remote:
+ query += " remote"
+
+ params = {"q": query, "udm": "8"}
+ response = self.session.get(self.url, headers=headers_initial, params=params)
+
+        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
+ match_fc = re.search(pattern_fc, response.text)
+ data_async_fc = match_fc.group(1) if match_fc else None
+ return data_async_fc
+
+    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str | None]:
+ params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
+ response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
+ return self._parse_jobs(response.text)
+
+    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str | None]:
+ """
+ Parses jobs on a page with next page cursor
+ """
+ start_idx = job_data.find("[[[")
+ end_idx = job_data.rindex("]]]") + 3
+ s = job_data[start_idx:end_idx]
+ parsed = json.loads(s)[0]
+
+ pattern_fc = r'data-async-fc="([^"]+)"'
+ match_fc = re.search(pattern_fc, job_data)
+ data_async_fc = match_fc.group(1) if match_fc else None
+ jobs_on_page = []
+
+ for array in parsed:
+            _, job_data = array
+ if not job_data.startswith("[[["):
+ continue
+ job_d = json.loads(job_data)
+
+ job_info = self._find_job_info(job_d)
+
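+            # job_info is a positional array whose indices (3: url, 0: title,
+            # 1: company, 2: location, 12: posted age, 19: description, 28: id)
+            # appear to be reverse-engineered from Google's response format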
+ job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
+ if job_url in self.seen_urls:
+ continue
+ self.seen_urls.add(job_url)
+
+ title = job_info[0]
+ company_name = job_info[1]
+ location = city = job_info[2]
+ state = country = date_posted = None
+ if location and "," in location:
+                city, state, *country = [part.strip() for part in location.split(",")]
+
+            days_ago_str = job_info[12]
+            if isinstance(days_ago_str, str):
+                match = re.search(r"\d+", days_ago_str)
+                if match:  # guard: timedelta(days=None) would raise TypeError
+                    days_ago = int(match.group())
+                    date_posted = (datetime.now() - timedelta(days=days_ago)).date()
+
+ description = job_info[19]
+
+ job_post = JobPost(
+ id=f"go-{job_info[28]}",
+ title=title,
+ company_name=company_name,
+ location=Location(
+ city=city, state=state, country=country[0] if country else None
+ ),
+ job_url=job_url,
+ job_url_direct=job_url,
+ date_posted=date_posted,
+ is_remote="remote" in description.lower()
+ or "wfh" in description.lower(),
+ description=description,
+ emails=extract_emails_from_text(description),
+ job_type=extract_job_type(description),
+ )
+ jobs_on_page.append(job_post)
+ return jobs_on_page, data_async_fc
+
+ @staticmethod
+ def _find_job_info(jobs_data: list | dict) -> list | None:
+ """Iterates through the JSON data to find the job listings"""
+ if isinstance(jobs_data, dict):
+ for key, value in jobs_data.items():
+ if key == "520084652" and isinstance(value, list):
+ return value
+ else:
+ result = GoogleJobsScraper._find_job_info(value)
+ if result:
+ return result
+ elif isinstance(jobs_data, list):
+ for item in jobs_data:
+ result = GoogleJobsScraper._find_job_info(item)
+ if result:
+ return result
+ return None
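
For reference, the new scraper can also be driven directly, bypassing `scrape_jobs`; a sketch assuming `ScraperInput`'s defaults cover the fields not set here:

```python
from jobspy.scrapers import ScraperInput, Site
from jobspy.scrapers.google import GoogleJobsScraper

scraper = GoogleJobsScraper()
response = scraper.scrape(
    ScraperInput(
        site_type=[Site.GOOGLE],
        search_term="data engineer",
        results_wanted=10,
    )
)
for job in response.jobs:
    print(job.title, job.job_url)
```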
diff --git a/src/jobspy/scrapers/google/constants.py b/src/jobspy/scrapers/google/constants.py
new file mode 100644
index 00000000..a0d13b00
--- /dev/null
+++ b/src/jobspy/scrapers/google/constants.py
@@ -0,0 +1,52 @@
+headers_initial = {
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+ "accept-language": "en-US,en;q=0.9",
+ "priority": "u=0, i",
+ "referer": "https://www.google.com/",
+ "sec-ch-prefers-color-scheme": "dark",
+ "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+ "sec-ch-ua-arch": '"arm"',
+ "sec-ch-ua-bitness": '"64"',
+ "sec-ch-ua-form-factors": '"Desktop"',
+ "sec-ch-ua-full-version": '"130.0.6723.58"',
+ "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-model": '""',
+ "sec-ch-ua-platform": '"macOS"',
+ "sec-ch-ua-platform-version": '"15.0.1"',
+ "sec-ch-ua-wow64": "?0",
+ "sec-fetch-dest": "document",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "same-origin",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+ "x-browser-channel": "stable",
+ "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
+ "x-browser-year": "2024",
+}
+
+headers_jobs = {
+ "accept": "*/*",
+ "accept-language": "en-US,en;q=0.9",
+ "priority": "u=1, i",
+ "referer": "https://www.google.com/",
+ "sec-ch-prefers-color-scheme": "dark",
+ "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
+ "sec-ch-ua-arch": '"arm"',
+ "sec-ch-ua-bitness": '"64"',
+ "sec-ch-ua-form-factors": '"Desktop"',
+ "sec-ch-ua-full-version": '"130.0.6723.58"',
+ "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-model": '""',
+ "sec-ch-ua-platform": '"macOS"',
+ "sec-ch-ua-platform-version": '"15.0.1"',
+ "sec-ch-ua-wow64": "?0",
+ "sec-fetch-dest": "empty",
+ "sec-fetch-mode": "cors",
+ "sec-fetch-site": "same-origin",
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+}
+
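+# opaque token captured from a live browser session; it is passed through
+# verbatim as the "async" query parameter on each jobs request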
+async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index f3f679c7..bd379ab5 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -72,7 +72,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
logger.info(
- f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}"
+ f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, cursor = self._scrape_page(cursor)
if not jobs:
@@ -258,7 +258,7 @@ def _process_job(self, job: dict) -> JobPost | None:
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
- logo_photo_url=(
+ company_logo=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index f6bc63ba..c3629f64 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -232,7 +232,7 @@ def _process_job(
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
- logo_photo_url=job_details.get("logo_photo_url"),
+ company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)
@@ -275,7 +275,7 @@ def _get_job_details(self, job_id: str) -> dict:
if job_function_span:
job_function = job_function_span.text.strip()
- logo_photo_url = (
+ company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
@@ -286,7 +286,7 @@ def _get_job_details(self, job_id: str) -> dict:
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
- "logo_photo_url": logo_photo_url,
+ "company_logo": company_logo,
"job_function": job_function,
}
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 760d52cd..7c032d74 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -264,3 +264,22 @@ def convert_monthly_to_annual(monthly_wage):
else:
return interval, min_salary, max_salary, "USD"
return None, None, None, None
+
+
+def extract_job_type(description: str) -> list[JobType] | None:
+    """Finds the job type(s) mentioned in a job description, if any"""
+    if not description:
+        return None
+
+    keywords = {
+        JobType.FULL_TIME: r"full[\s-]?time",
+        JobType.PART_TIME: r"part[\s-]?time",
+        JobType.INTERNSHIP: r"internship",
+        JobType.CONTRACT: r"contract",
+    }
+
+    listing_types = []
+    for key, pattern in keywords.items():
+        if re.search(pattern, description, re.IGNORECASE):
+            listing_types.append(key)
+
+    return listing_types if listing_types else None
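
A usage sketch for the new helper:

```python
from jobspy.scrapers.utils import extract_job_type

types = extract_job_type("This is a full-time, contract-to-hire role")
# two patterns match -> [JobType.FULL_TIME, JobType.CONTRACT]
print(types)
```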
diff --git a/tests/test_google.py b/tests/test_google.py
new file mode 100644
index 00000000..9f30ffec
--- /dev/null
+++ b/tests/test_google.py
@@ -0,0 +1,12 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_google():
+ result = scrape_jobs(
+ site_name="google", search_term="software engineer", results_wanted=5
+ )
+
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a DataFrame with 5 rows"