Skip to content

Commit

Permalink
Merge pull request #1059 from City-Bureau/adcrc
Browse files Browse the repository at this point in the history
🏗️ Build spider: Illinois African Descent Citizens Reparations Commission
  • Loading branch information
SimmonsRitchie authored Feb 12, 2024
2 parents b79eaca + e981b54 commit f451a7f
Show file tree
Hide file tree
Showing 6 changed files with 1,248 additions and 7 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ scrapy-sentry-errors = "1.0.0"
scrapy-wayback-middleware = "*"
city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
pdfminer-six = "*"
bs4 = "*"

[dev-packages]
freezegun = "*"
Expand Down
40 changes: 33 additions & 7 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

105 changes: 105 additions & 0 deletions city_scrapers/spiders/il_adcrc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import re
from datetime import datetime

from bs4 import BeautifulSoup
from city_scrapers_core.constants import COMMISSION
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class IlAdcrcSpider(CityScrapersSpider):
    """Spider for the Illinois African Descent Citizens Reparations Commission.

    The meetings page embeds the URL of a JSON event feed in a
    ``data-news-feed-url`` attribute. ``parse`` extracts that URL and
    ``parse_json`` turns each entry of the feed into a ``Meeting`` item.
    """

    name = "il_adcrc"
    agency = "Illinois African Descent Citizens Reparations Commission"
    timezone = "America/Chicago"
    start_urls = ["https://adcrc.illinois.gov/meetings.html"]

    # Timestamp layouts accepted by ``_parse_datetime``, tried in order.
    _DATETIME_FORMATS = ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S")

    def parse(self, response):
        """
        Retrieve the URL to the upcoming meetings JSON feed from the main
        page and then follow that URL to parse the feed.
        """
        upcoming_meetings_url = response.css(
            ".cmp-news-feed::attr(data-news-feed-url)"
        ).get()
        if not upcoming_meetings_url:
            self.logger.error("No upcoming meetings found")
            return
        yield response.follow(upcoming_meetings_url, self.parse_json)

    def parse_json(self, response):
        """Parse the JSON feed and yield one ``Meeting`` per feed entry."""
        # Named ``feed`` rather than ``json`` so the stdlib module name is
        # not shadowed inside this method.
        feed = response.json()
        # A feed without the list key is treated as having no meetings.
        for item in feed.get("eventFeedItemList") or []:
            meeting = Meeting(
                title=item["eventTitle"],
                description=self._parse_description(item),
                classification=COMMISSION,
                start=self._parse_datetime(item["start"]),
                # ``end`` is optional for a meeting; tolerate its absence.
                end=self._parse_datetime(item.get("end")),
                all_day=False,
                time_notes="",
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(response),
            )
            meeting["status"] = self._get_status(meeting, item)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_description(self, item):
        """Parse meeting description.

        The feed's ``description`` is used when present. Otherwise the
        HTML ``additionalInfo`` of the first virtual location is converted
        to whitespace-normalized plain text. Returns ``""`` when neither
        source is available.
        """
        if item.get("description"):
            return item["description"]
        virtual_list = item.get("virtualList")
        if not virtual_list:
            return ""
        # ``additionalInfo`` may be missing or null; BeautifulSoup cannot
        # be constructed from ``None``.
        html_content = virtual_list[0].get("additionalInfo") or ""
        if not html_content:
            return ""
        soup = BeautifulSoup(html_content, "html.parser")
        plain_text = soup.get_text(separator=" ", strip=True)
        return re.sub(r"\s+", " ", plain_text).strip()

    def _parse_datetime(self, datetime_str):
        """Parse a feed timestamp as a naive datetime object.

        Accepts ``YYYY-MM-DDTHH:MM:SS`` with or without fractional seconds.
        Returns ``None`` for an empty or missing value and raises
        ``ValueError`` when the string matches no supported layout.
        """
        if not datetime_str:
            return None
        for fmt in self._DATETIME_FORMATS:
            try:
                return datetime.strptime(datetime_str, fmt)
            except ValueError:
                continue
        raise ValueError(f"Unrecognized datetime format: {datetime_str!r}")

    def _parse_location(self, item):
        """Parse or generate location.

        Uses the first entry of ``physicalList``; falls back to a "TBD"
        placeholder when the meeting has no physical location.
        """
        physical_list = item.get("physicalList")
        if not physical_list:
            return {
                "address": "",
                "name": "TBD",
            }
        location = physical_list[0]
        name = (location.get("locationName") or "").replace("In-Person: ", "")
        # Skip null/empty parts so the address never contains "None".
        address_parts = (
            location.get("streetLineOne"),
            location.get("city"),
            location.get("state"),
        )
        address = ", ".join(part for part in address_parts if part)
        return {
            "address": address,
            "name": name,
        }

    def _parse_links(self, item):
        """Generate links based on virtual link details.

        Always returns a list: empty when there is no virtual location or
        the location carries no link.
        """
        virtual_list = item.get("virtualList")
        if not virtual_list:
            return []
        virtual_location = virtual_list[0]
        href = virtual_location.get("link")
        if not href:
            return []
        return [{"href": href, "title": virtual_location.get("locationName") or ""}]

    def _parse_source(self, response):
        """Generate source."""
        return response.url

    def _get_status(self, meeting, item):
        """Checks the canceledEvent property first and then passes a
        "canceled" string to the parent class's _get_status method so
        we can rely on default status handling.
        """
        # Normalize so that both the string "true" and a JSON boolean
        # ``true`` are recognized as a cancellation flag.
        canceled_flag = str(item.get("canceledEvent") or "").strip().lower()
        if canceled_flag == "true":
            return super()._get_status(meeting, text="canceled")
        return super()._get_status(meeting)
Loading

0 comments on commit f451a7f

Please sign in to comment.