Adding Bayt.com Scraper to current codebase (#246)
Showing 5 changed files with 166 additions and 0 deletions.
from __future__ import annotations

import time
import random
from typing import Optional

import requests
from bs4 import BeautifulSoup, Tag

from .. import Scraper, ScraperInput, Site
from ..exceptions import BaytException
from ...jobs import JobPost, JobResponse, Location, Country
from ..utils import create_logger

logger = create_logger("Bayt")
logger.setLevel("DEBUG")  # Ensure DEBUG messages are output

class BaytScraper(Scraper):
    base_url = "https://www.bayt.com"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
        self.scraper_input = None
        self.country = "worldwide"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        page = 1
        results_wanted = (
            scraper_input.results_wanted if scraper_input.results_wanted else 10
        )

        while len(job_list) < results_wanted:
            logger.info(f"Fetching Bayt jobs page {page}")
            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
            if not job_elements:
                break

            logger.debug(
                "First job element snippet:\n" + job_elements[0].prettify()[:500]
            )

            initial_count = len(job_list)
            for job in job_elements:
                try:
                    job_post = self._extract_job_info(job)
                    if job_post:
                        job_list.append(job_post)
                        if len(job_list) >= results_wanted:
                            break
                    else:
                        logger.debug(
                            "Extraction returned None. Job snippet:\n"
                            + job.prettify()[:500]
                        )
                except Exception as e:
                    logger.error(f"Bayt: Error extracting job info: {str(e)}")
                    continue

            if len(job_list) == initial_count:
                logger.info(f"No new jobs found on page {page}. Ending pagination.")
                break

            page += 1
            # Randomized delay between pages to avoid hammering the site.
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

        job_list = job_list[:results_wanted]
        return JobResponse(jobs=job_list)

    def _fetch_jobs(self, query: str, page: int = 1) -> Optional[list[Tag]]:
        """
        Grabs the job results for the given query and page number.
        """
        try:
            # Updated URL to include the "international" segment as per the original code.
            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
            logger.info(f"Constructed URL: {url}")
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/115.0.0.0 Safari/537.36"
                )
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            # Use the attribute selector as in the original code.
            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
            logger.info(f"Found {len(job_listings)} job listing elements")
            return job_listings
        except Exception as e:
            logger.error(f"Bayt: Error fetching jobs - {str(e)}")
            return None

    def _extract_job_info(self, job: Tag) -> Optional[JobPost]:
        """
        Extracts the job information from a single job listing.
        """
        # Find the h2 element holding the title and link (no class filtering).
        job_general_information = job.find("h2")
        if not job_general_information:
            return None

        job_title = job_general_information.get_text(strip=True)
        job_url = self._extract_job_url(job_general_information)
        if not job_url:
            return None

        # Extract company name using the original approach:
        company_tag = job.find("div", class_="t-nowrap p10l")
        company_name = (
            company_tag.find("span").get_text(strip=True)
            if company_tag and company_tag.find("span")
            else None
        )

        # Extract location using the original approach:
        location_tag = job.find("div", class_="t-mute t-small")
        location = location_tag.get_text(strip=True) if location_tag else None

        # Derive an ID from the URL hash. Note that Python's str hash is salted
        # per process, so this ID is not stable across runs.
        job_id = f"bayt-{abs(hash(job_url))}"
        location_obj = Location(
            city=location,
            country=Country.from_string(self.country),
        )

        return JobPost(
            id=job_id,
            title=job_title,
            company_name=company_name,
            company_url="",
            location=location_obj,
            date_posted=None,
            job_url=job_url,
            compensation=None,
            job_type=None,
            job_level=None,
            company_industry=None,
            description=None,
            job_url_direct=None,
            emails=[],
            company_logo=None,
            job_function=None,
        )

    def _extract_job_url(self, job_general_information: Tag) -> Optional[str]:
        """
        Pulls the job URL from the 'a' tag within the h2 element.
        """
        a_tag = job_general_information.find("a")
        if a_tag and a_tag.has_attr("href"):
            return self.base_url + a_tag["href"].strip()
        return None
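
To make the extraction logic concrete, here is a standalone sketch that runs the same BeautifulSoup selectors against a hand-written HTML fixture. The fixture's shape is inferred from the selectors in _extract_job_info, not copied from Bayt's real markup, so treat it as an illustration only:

from bs4 import BeautifulSoup

# Hypothetical listing markup shaped to match the selectors above.
html = """
<li data-js-job="">
  <h2><a href="/en/job/senior-python-developer-12345/">Senior Python Developer</a></h2>
  <div class="t-nowrap p10l"><span>Acme Corp</span></div>
  <div class="t-mute t-small">Dubai</div>
</li>
"""
soup = BeautifulSoup(html, "html.parser")
job = soup.find("li", attrs={"data-js-job": ""})
title = job.find("h2").get_text(strip=True)
company = job.find("div", class_="t-nowrap p10l").find("span").get_text(strip=True)
location = job.find("div", class_="t-mute t-small").get_text(strip=True)
print(title, company, location)  # Senior Python Developer Acme Corp Dubai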
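
And a minimal usage sketch. The import paths are assumptions based on the module's relative imports (the diff does not show the package layout), and ScraperInput is assumed to accept the search_term and results_wanted fields the scraper reads; check the surrounding codebase for the actual constructor:

from jobspy.scrapers.bayt import BaytScraper  # hypothetical import path
from jobspy.scrapers import ScraperInput      # hypothetical import path

scraper = BaytScraper()
response = scraper.scrape(ScraperInput(search_term="python", results_wanted=5))
for post in response.jobs:
    print(post.title, "-", post.job_url)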