-
-
Notifications
You must be signed in to change notification settings - Fork 311
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1059 from City-Bureau/adcrc
🏗️ Build spider: Illinois African Descent Citizens Reparations Commission
- Loading branch information
Showing
6 changed files
with
1,248 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import re | ||
from datetime import datetime | ||
|
||
from bs4 import BeautifulSoup | ||
from city_scrapers_core.constants import COMMISSION | ||
from city_scrapers_core.items import Meeting | ||
from city_scrapers_core.spiders import CityScrapersSpider | ||
|
||
|
||
class IlAdcrcSpider(CityScrapersSpider):
    """Spider for Illinois African Descent Citizens Reparations Commission.

    The meetings page exposes the URL of a JSON feed in the
    ``data-news-feed-url`` attribute of its ``.cmp-news-feed`` element. The
    spider follows that URL and yields one ``Meeting`` for each entry of the
    feed's ``eventFeedItemList``.
    """

    name = "il_adcrc"
    agency = "Illinois African Descent Citizens Reparations Commission"
    timezone = "America/Chicago"
    start_urls = ["https://adcrc.illinois.gov/meetings.html"]

    def parse(self, response):
        """
        Retrieve the URL to the upcoming meetings JSON feed from the main
        page and then follow that URL to parse the feed.

        Logs an error and yields nothing when the page carries no feed URL.
        """
        upcoming_meetings_url = response.css(
            ".cmp-news-feed::attr(data-news-feed-url)"
        ).get()
        if not upcoming_meetings_url:
            self.logger.error("No upcoming meetings found")
            return
        yield response.follow(upcoming_meetings_url, self.parse_json)

    def parse_json(self, response):
        """Parse the JSON feed and yield one ``Meeting`` per feed entry."""
        # Named ``feed`` rather than ``json`` so the stdlib module name is not
        # shadowed inside this method.
        feed = response.json()
        for item in feed["eventFeedItemList"]:
            meeting = Meeting(
                title=item["eventTitle"],
                description=self._parse_description(item),
                classification=COMMISSION,
                start=self._parse_datetime(item["start"]),
                end=self._parse_datetime(item["end"]),
                all_day=False,
                time_notes="",
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(response),
            )
            # Status and id are derived from the other fields, so they are
            # filled in only after the Meeting has been assembled.
            meeting["status"] = self._get_status(meeting, item)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_description(self, item):
        """Parse meeting description. In most cases, the description appears
        to be empty. If so, we default to providing info about the virtual
        meeting.

        Returns the feed's ``description`` when it is non-empty; otherwise the
        ``additionalInfo`` HTML of the first ``virtualList`` entry reduced to
        whitespace-normalized plain text; otherwise an empty string.
        """
        if item.get("description"):
            return item["description"]
        virtual_list = item.get("virtualList")
        if not virtual_list:
            return ""
        # ``additionalInfo`` may be absent or null; treat both as empty markup
        # instead of letting the parser raise.
        html_content = virtual_list[0].get("additionalInfo") or ""
        soup = BeautifulSoup(html_content, "html.parser")
        plain_text = soup.get_text(separator=" ", strip=True)
        return re.sub(r"\s+", " ", plain_text).strip()

    def _parse_datetime(self, datetime_str):
        """Parse a feed timestamp as a naive datetime object.

        The string must match ``%Y-%m-%dT%H:%M:%S.%f``; ``ValueError`` is
        raised otherwise.
        """
        return datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%S.%f")

    def _parse_location(self, item):
        """Parse or generate location.

        Uses the first ``physicalList`` entry. When there is none, a
        placeholder location named "TBD" with an empty address is returned.
        """
        physical_list = item.get("physicalList")
        if not physical_list:
            return {
                "address": "",
                "name": "TBD",
            }
        location = physical_list[0]
        name = (location.get("locationName") or "").replace("In-Person: ", "")
        # Join only the parts that are present so a missing or null field does
        # not leave a literal "None" or a dangling comma in the address.
        address_parts = (
            location.get("streetLineOne"),
            location.get("city"),
            location.get("state"),
        )
        address = ", ".join(part for part in address_parts if part)
        return {
            "address": address,
            "name": name,
        }

    def _parse_links(self, item):
        """Generate links based on virtual link details.

        Returns a list with one ``{"href", "title"}`` dict built from the
        first ``virtualList`` entry, or an empty list when there is no virtual
        location or it carries no link.
        """
        virtual_list = item.get("virtualList")
        if not virtual_list:
            # Always return a list so both branches have the same type.
            return []
        virtual_location = virtual_list[0]
        href = virtual_location.get("link")
        if not href:
            return []
        return [{"href": href, "title": virtual_location.get("locationName") or ""}]

    def _parse_source(self, response):
        """Generate source."""
        return response.url

    def _get_status(self, meeting, item):
        """Checks the canceledEvent property first and then passes a
        "canceled" string to the parent class's _get_status method so
        we can rely on default status handling.

        ``canceledEvent`` is accepted either as the string "true" (any case)
        or as a JSON boolean ``true``.
        """
        if str(item.get("canceledEvent")).lower() == "true":
            return super()._get_status(meeting, text="canceled")
        return super()._get_status(meeting)
Oops, something went wrong.