Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0572 spider chi ssa 38 #962

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions city_scrapers/spiders/chi_ssa_38.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import re
from datetime import datetime

from city_scrapers_core.constants import COMMISSION, NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class ChiSsa38Spider(CityScrapersSpider):
name = "chi_ssa_38"
agency = "Chicago Special Service Area #38 Northcenter"
timezone = "America/Chicago"
start_urls = ["http://www.northcenterchamber.com/pages/MeetingsTransparency1"]

def parse(self, response):
    """
    Yield one `Meeting` per list-entry text node in the page's content block.

    Every text node matched by the CSS selector is passed to the `_parse_*`
    helpers to fill in the meeting fields; `status` and `id` are then derived
    from the assembled meeting before it is yielded.
    """
    entries = response.css("div#content-167642 li::text")
    for entry in entries:
        parsed = Meeting(
            title=self._parse_title(entry),
            description=self._parse_description(entry),
            classification=self._parse_classification(entry),
            start=self._parse_start(entry),
            end=self._parse_end(entry),
            all_day=self._parse_all_day(entry),
            time_notes=self._parse_time_notes(entry),
            location=self._parse_location(entry),
            links=self._parse_links(entry),
            source=self._parse_source(response),
        )

        # Both derived fields are computed from the populated meeting.
        parsed["status"] = self._get_status(parsed)
        parsed["id"] = self._get_id(parsed)

        yield parsed

def _parse_title(self, item):
"""Parse or generate meeting title."""
title = "Chamber of Commerce"
return title
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mentioned in _parse_description, but it's fine to just return the string without assigning to a variable first. It's a bit odd for SSAs, but this one should be "Commission". They're technically separate entities managed by a nonprofit


def _parse_description(self, item):
"""Parse or generate meeting description."""
description = ""
return description
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's fine to just return "" instead of setting a variable first


def _parse_classification(self, item):
    """Return the meeting classification.

    All meetings scraped by this spider are commission meetings, so the
    classification is the constant `COMMISSION`; `item` is not read.
    """
    return COMMISSION
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be COMMISSION for all meetings on this spider


def _parse_start(self, item):
"""Parse start datetime as a naive datetime object."""
date_str = item.extract()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we might be able to simplify this a bit, and we'll also need to handle situations where minutes may be supplied for the time. Haven't tested this, but something like this snippet could work:

item_str = item.extract()
month_day_str = re.search(r"[A-Z][a-z]{2,9} \d{1,2}", item_str).group()

year_str = re.search(r"\d{4}", item_str).group()
if not year_str[:2] == "20":
  year_str = str(datetime.today().year)  # Default to current year

time_match = re.search(r"\d{1,2}(\:\d\d) [apm\.]{2,4}", item_str)  # We want to check for a minutes portion here
time_str = "12 am"
if time_match:
  time_str = time_match.group().replace(".", "")

time_fmt = "%I %p"
if ":" in time_str:
  time_fmt = "%I:%M %p"

return datetime.strptime(f"{month_day_str} {year_str} {time_str}", f"%B %d %Y {time_str}")


# Use regex to extract parts of the date
date_words = date_str.split()
month_name = date_words[0]
datetime_obj = datetime.strptime(month_name, "%B")
meeting_day = int(re.findall("[0-9]+", date_words[1])[0])

# Meeting time defaults to 12:00 AM
meeting_hour = 0
# Meeting year defaults to current
meeting_year = datetime.today().year

# Handle irregularly formatted parts of date
for i in range(len(date_words)):
# Meeting time appears in different formats, but always one
# index before "a.m." or "p.m."
if "a.m." in date_words[i]:
meeting_hour = int(date_words[i - 1])
elif "p.m." in date_words[i]:
meeting_hour = int(date_words[i - 1]) + 12

# Validate reasonable year exists if it starts with "20"
if date_words[i][0:2] == "20":
meeting_year = int(re.findall("[0-9]+", date_words[i])[0])

# Put time, day, and year into datetime object
start_time = datetime_obj.replace(
hour=meeting_hour, day=meeting_day, year=meeting_year
)

return start_time

def _parse_end(self, item):
"""Parse end datetime as a naive datetime object. Added by pipeline if None"""
return None

def _parse_time_notes(self, item):
"""Parse any additional notes on the timing of the meeting"""
return ""

def _parse_all_day(self, item):
"""Parse or generate all-day status. Defaults to False."""
return False

def _parse_location(self, item):
# Meetings seemingly ocurred at
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like an abbreviated version of this is in the meeting item, since we don't have examples of another format we can do something like this to return a default location if "4054" is present and raise an exception otherwise:

if "4054" not in item.extract():
  raise ValueError("Meeting location has changed")
return {
  "address": "4054 N Lincoln Ave, Chicago, IL 60618",
  "name": "Northcenter Chamber of Commerce",
}


return {
"address": "4054 N Lincoln Ave, Chicago, IL 60618",
"name": "Northcenter Chamber of Commerce",
}

def _parse_links(self, item):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll need to parse a mapping of dates to relevant links from the page so that things like meeting minutes can be associated with the meetings listed. Here's an example of that:

def _parse_link_date_map(self, response):
"""Generate a defaultdict mapping of meeting dates and associated links"""
link_date_map = defaultdict(list)
for link in response.css(
".vc_col-sm-4.column_container:nth-child(1) .mk-text-block.indent16"
)[:1].css("a"):
link_str = link.xpath("./text()").extract_first()
link_start = self._parse_start(link_str)
if link_start:
link_date_map[link_start.date()].append(
{
"title": re.sub(r"\s+", " ", link_str.split(" – ")[-1]).strip(),
"href": link.attrib["href"],
}
)
for section in response.css(
".vc_col-sm-4.column_container:nth-child(1) .vc_tta-panel"
):
year_str = section.css(".vc_tta-title-text::text").extract_first().strip()
for section_link in section.css("p > a"):
link_str = section_link.xpath("./text()").extract_first()
link_dt = self._parse_start(link_str, year=year_str)
if link_dt:
link_date_map[link_dt.date()].append(
{
"title": re.sub(
r"\s+", " ", link_str.split(" – ")[-1]
).strip(),
"href": section_link.xpath("@href").extract_first(),
}
)
return link_date_map

"""Return the meeting's links as a list of dicts with "href" and "title" keys."""
# NOTE(review): `item` is not read; a single placeholder entry with an empty
# "href" and "title" is returned for every meeting.
return [{"href": "", "title": ""}]

def _parse_source(self, response):
"""Parse or generate source."""
return response.url
Loading