fix(scotus): Update scotus slip #772

Merged · 1 commit · Nov 18, 2023
@@ -1,14 +1,158 @@
from juriscraper.opinions.united_states.federal_appellate import scotus_slip

"""
Court Contact: https://www.supremecourt.gov/contact/contact_webmaster.aspx
"""


from datetime import date

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite


class Site(OpinionSite):
    required_headers = ["Date", "Docket", "Name", "J."]
    expected_headers = required_headers + ["Revised", "R-", "Pt."]
    justices = {
        "A": "Samuel Alito",
        "AB": "Amy Coney Barrett",
        "AS": "Antonin Scalia",
        "B": "Stephen Breyer",
        "BK": "Brett Kavanaugh",
        "D": "Decree",
        "DS": "David Souter",
        "EK": "Elena Kagan",
        "G": "Ruth Bader Ginsburg",
        "JS": "John Paul Stevens",
        "K": "Anthony Kennedy",
        "KJ": "Ketanji Brown Jackson",
        "NG": "Neil Gorsuch",
        "PC": "Per Curiam",
        "R": "John G. Roberts",
        "SS": "Sonia Sotomayor",
        "T": "Clarence Thomas",
    }

class Site(scotus_slip.Site):
    # Note that scotus_relating inherits from this class.
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.yy = self._get_current_term()
        self.back_scrape_iterable = list(range(5, int(self.yy) + 1))
        self.url_base = "https://www.supremecourt.gov/opinions"
        self.path_table = "//table[@class='table table-bordered']"
        self.path_row = f"{self.path_table}/tr[position() > 1]"
        self.precedential = "In-chambers"
        self.court = "in-chambers"
        self.headers = False
        self.url = False
        self.headers = []
        self.cases = []

    @staticmethod
    def _get_current_term():
        """The URLs for SCOTUS correspond to the term, not the calendar year.

        The terms kick off on the first Monday of October, so we use
        October 1st as our cutoff date.
        """
        today = date.today()
        term_cutoff = date(today.year, 10, 1)
        if today < term_cutoff:
            # Haven't hit the cutoff, return previous year.
            return int(today.strftime("%y")) - 1  # y3k bug!
        else:
            return today.strftime("%y")
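    # Illustrative: on 2023-09-30 _get_current_term() returns 22 (an int,
    # October Term 2022); on or after 2023-10-01 it returns "23" (a string).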

    def _download(self, request_dict={}):
        if not self.test_mode_enabled():
            self.set_url()
        html = super()._download(request_dict)
        self.extract_cases_from_html(html)
        return html

    def set_url(self):
        self.url = f"{self.url_base}/{self.court}.aspx"
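        # e.g. https://www.supremecourt.gov/opinions/in-chambers.aspx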

    def set_table_headers(self, html):
        # Do nothing if table is missing
        if html.xpath(self.path_table):
            path = f"{self.path_table}//th"
            self.headers = [
                cell.text_content().strip() for cell in html.xpath(path)
            ]
            # Ensure that expected/required headers are present
            if not set(self.required_headers).issubset(self.headers):
                raise InsanityException("Required table column missing")

    def extract_cases_from_html(self, html):
        self.set_table_headers(html)
        for row in html.xpath(self.path_row):
            case = self.extract_case_data_from_row(row)
            if case:
                # Below will raise a KeyError if a new judge key is
                # encountered (i.e. a new SCOTUS justice is appointed)
                case["judge"] = self.justices[case["J."]] if case["J."] else ""
                self.cases.append(case)
                for revision_data in case["revisions"]:
                    revision = case.copy()
                    revision["Date"] = revision_data["date_string"]
                    revision["Name_Url"] = revision_data["href"]
                    self.cases.append(revision)

    def extract_case_data_from_row(self, row):
        cell_index = 0
        case = {"revisions": []}
        # Process each cell in row
        for cell in row.xpath("./td"):
            text = cell.text_content().strip()
            # Skip rows with blank first cell
            if cell_index == 0 and not text:
                break
            label = self.headers[cell_index]
            if label in ["R-", "Pt."]:
                # Ignore some columns that we don't need
                pass
            elif label == "Revised":
                # It is possible for an opinion to have multiple
                # revisions, so we need to iterate over the links
                # in the cell
                for anchor in cell.xpath("a"):
                    case["revisions"].append(
                        {
                            "href": anchor.xpath("@href")[0],
                            "date_string": anchor.text_content(),
                        }
                    )
            else:
                # Handle normal data cells
                case[label] = text
                href = cell.xpath("./a/@href")
                if href:
                    case[f"{label}_Url"] = href[0]
            cell_index += 1
        return case
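        # Illustrative shape of a returned case (hypothetical values):
        #   {"revisions": [], "Date": "11/13/23", "Docket": "22-123",
        #    "Name": "Smith v. Jones", "Name_Url": "https://...", "J.": "R"}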

    def _get_case_names(self):
        return [case["Name"] for case in self.cases]

    def _get_download_urls(self):
        return [case["Name_Url"] for case in self.cases]

    def _get_case_dates(self):
        return [convert_date_string(case["Date"]) for case in self.cases]

    def _get_docket_numbers(self):
        return [case["Docket"] for case in self.cases]

    def _get_judges(self):
        return [case["judge"] for case in self.cases]

    def _get_precedential_statuses(self):
        return [self.precedential] * len(self.cases)

    def _download_backwards(self, d):
        self.yy = str(d if d >= 10 else f"0{d}")
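        # e.g. d=5 -> "05" (term 2005); d=23 -> "23" (term 2023)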
        logger.info(f"Running backscraper for year: 20{self.yy}")
        self.html = self._download()
126 changes: 22 additions & 104 deletions juriscraper/opinions/united_states/federal_appellate/scotus_slip.py
@@ -5,15 +5,10 @@

from datetime import date

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
    required_headers = ["Date", "Docket", "Name", "J."]
    expected_headers = required_headers + ["Revised", "R-", "Pt."]
class Site(OpinionSiteLinear):
    justices = {
        "A": "Samuel Alito",
        "AB": "Amy Coney Barrett",
@@ -38,16 +33,11 @@ def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.yy = self._get_current_term()
        self.back_scrape_iterable = list(range(6, int(self.yy) + 1))
        self.status = "Published"
        self.url_base = "https://www.supremecourt.gov/opinions"
        self.path_table = "//table[@class='table table-bordered']"
        self.path_row = f"{self.path_table}/tr[position() > 1]"
        self.precedential = "Published"
        self.court = "slipopinion"
        self.headers = False
        self.url = False
        self.headers = []
        self.cases = []
        self.url = f"{self.url_base}/{self.court}/{self.yy}"
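        # e.g. https://www.supremecourt.gov/opinions/slipopinion/23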

    @staticmethod
    def _get_current_term():
@@ -64,93 +54,21 @@ def _get_current_term():
        else:
            return today.strftime("%y")

    def _download(self, request_dict={}):
        if not self.test_mode_enabled():
            self.set_url()
        html = super()._download(request_dict)
        self.extract_cases_from_html(html)
        return html

    def set_url(self):
        self.url = f"{self.url_base}/{self.court}/{self.yy}"

    def set_table_headers(self, html):
        # Do nothing if table is missing
        if html.xpath(self.path_table):
            path = f"{self.path_table}//th"
            self.headers = [
                cell.text_content().strip() for cell in html.xpath(path)
            ]
            # Ensure that expected/required headers are present
            if not set(self.required_headers).issubset(self.headers):
                raise InsanityException("Required table column missing")

    def extract_cases_from_html(self, html):
        self.set_table_headers(html)
        for row in html.xpath(self.path_row):
            case = self.extract_case_data_from_row(row)
            if case:
                # Below will raise a KeyError if a new judge key is
                # encountered (i.e. a new SCOTUS justice is appointed)
                case["judge"] = self.justices[case["J."]] if case["J."] else ""
                self.cases.append(case)
                for revision_data in case["revisions"]:
                    revision = case.copy()
                    revision["Date"] = revision_data["date_string"]
                    revision["Name_Url"] = revision_data["href"]
                    self.cases.append(revision)

    def extract_case_data_from_row(self, row):
        cell_index = 0
        case = {"revisions": []}
        # Process each cell in row
        for cell in row.xpath("./td"):
            text = cell.text_content().strip()
            # Skip rows with blank first cell
            if cell_index == 0 and not text:
                break
            label = self.headers[cell_index]
            if label in ["R-", "Pt."]:
                # Ignore some columns that we don't need
                pass
            elif label == "Revised":
                # It is possible for an opinion to have multiple
                # revisions, so we need to iterate over the links
                # in the cell
                for anchor in cell.xpath("a"):
                    case["revisions"].append(
                        {
                            "href": anchor.xpath("@href")[0],
                            "date_string": anchor.text_content(),
                        }
                    )
            else:
                # Handle normal data cells
                case[label] = text
                href = cell.xpath("./a/@href")
                if href:
                    case[f"{label}_Url"] = href[0]
            cell_index += 1
        return case

    def _get_case_names(self):
        return [case["Name"] for case in self.cases]

    def _get_download_urls(self):
        return [case["Name_Url"] for case in self.cases]

    def _get_case_dates(self):
        return [convert_date_string(case["Date"]) for case in self.cases]

    def _get_docket_numbers(self):
        return [case["Docket"] for case in self.cases]

    def _get_judges(self):
        return [case["judge"] for case in self.cases]

    def _get_precedential_statuses(self):
        return [self.precedential] * len(self.cases)

    def _download_backwards(self, d):
        self.yy = str(d if d >= 10 else f"0{d}")
        logger.info(f"Running backscraper for year: 20{self.yy}")
        self.html = self._download()
    def _process_html(self):
        for row in self.html.xpath("//tr"):
            cells = row.xpath(".//td")
            if len(cells) != 6:
                continue
            a, date, docket, link, justice, citation = row.xpath(".//td")
            if not link.text_content():
                continue
            self.cases.append(
                {
                    "citation": citation.text_content(),
                    "date": date.text_content(),
                    "url": link.xpath(".//a/@href")[0],
                    "name": link.text_content(),
                    "docket": docket.text_content(),
                    "judge": self.justices[justice.text_content()],
                }
            )
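A minimal usage sketch of the refactored scraper (illustrative only: `parse()` and the `site.cases` list follow juriscraper's standard AbstractSite/OpinionSiteLinear conventions, which are outside this diff):

from juriscraper.opinions.united_states.federal_appellate import scotus_slip

site = scotus_slip.Site()
site.parse()  # downloads the term page; _process_html() above fills site.cases
for case in site.cases:
    # keys grounded in _process_html: citation, date, url, name, docket, judge
    print(case["date"], case["docket"], case["name"], case["judge"])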