Merge pull request #772 from freelawproject/fix-scotus-slip
fix(scotus): Update scotus slip
flooie authored Nov 18, 2023
2 parents 3e65a49 + 1cc701f commit ea56991
Showing 8 changed files with 1,787 additions and 4,231 deletions.
@@ -1,14 +1,158 @@
from juriscraper.opinions.united_states.federal_appellate import scotus_slip

"""
Court Contact: https://www.supremecourt.gov/contact/contact_webmaster.aspx
"""


from datetime import date

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite


class Site(OpinionSite):
required_headers = ["Date", "Docket", "Name", "J."]
expected_headers = required_headers + ["Revised", "R-", "Pt."]
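    # Initials used in the table's "J." column, mapped to full names.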
justices = {
"A": "Samuel Alito",
"AB": "Amy Coney Barrett",
"AS": "Antonin Scalia",
"B": "Stephen Breyer",
"BK": "Brett Kavanaugh",
"D": "Decree",
"DS": "David Souter",
"EK": "Elana Kagan",
"G": "Ruth Bader Ginsburg",
"JS": "John Paul Stephens",
"K": "Anthony Kennedy",
"KJ": "Ketanji Brown Jackson",
"NG": "Neil Gorsuch",
"PC": "Per Curiam",
"R": "John G. Roberts",
"SS": "Sonia Sotomayor",
"T": "Clarence Thomas",
}

class Site(scotus_slip.Site):
# Note that scotus_relating inherits from this class.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.yy = self._get_current_term()
self.back_scrape_iterable = list(range(5, int(self.yy) + 1))
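        # One entry per two-digit term year, from 5 (i.e., 2005) through
        # the current term; _download_backwards zero-pads these.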
self.url_base = "https://www.supremecourt.gov/opinions"
self.path_table = "//table[@class='table table-bordered']"
self.path_row = f"{self.path_table}/tr[position() > 1]"
self.precedential = "In-chambers"
self.court = "in-chambers"
        self.url = False
        self.headers = []
self.cases = []

@staticmethod
def _get_current_term():
"""The URLs for SCOTUS correspond to the term, not the calendar.
The terms kick off on the first Monday of October, so we use October 1st
as our cut off date.
"""
today = date.today()
term_cutoff = date(today.year, 10, 1)
if today < term_cutoff:
# Haven't hit the cutoff, return previous year.
            return str(int(today.strftime("%y")) - 1)  # y3k bug!
else:
return today.strftime("%y")

def _download(self, request_dict={}):
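        # Outside of tests, point self.url at the live page before the
        # parent class fetches it, then parse the table into self.cases.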
if not self.test_mode_enabled():
self.set_url()
html = super()._download(request_dict)
self.extract_cases_from_html(html)
return html

def set_url(self):
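        # e.g. https://www.supremecourt.gov/opinions/in-chambers.aspx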
self.url = f"{self.url_base}/{self.court}.aspx"

def set_table_headers(self, html):
# Do nothing if table is missing
if html.xpath(self.path_table):
path = f"{self.path_table}//th"
self.headers = [
cell.text_content().strip() for cell in html.xpath(path)
]
# Ensure that expected/required headers are present
if not set(self.required_headers).issubset(self.headers):
raise InsanityException("Required table column missing")

def extract_cases_from_html(self, html):
self.set_table_headers(html)
for row in html.xpath(self.path_row):
case = self.extract_case_data_from_row(row)
if case:
                # Raises a KeyError if an unknown judge key is encountered
                # (i.e., a newly appointed SCOTUS justice).
case["judge"] = self.justices[case["J."]] if case["J."] else ""
self.cases.append(case)
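                # Each revision becomes its own entry that points at the
                # revised PDF but keeps the rest of the row's data.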
for revision_data in case["revisions"]:
revision = case.copy()
revision["Date"] = revision_data["date_string"]
revision["Name_Url"] = revision_data["href"]
self.cases.append(revision)

def extract_case_data_from_row(self, row):
cell_index = 0
case = {"revisions": []}
# Process each cell in row
for cell in row.xpath("./td"):
text = cell.text_content().strip()
# Skip rows with blank first cell
if cell_index == 0 and not text:
break
label = self.headers[cell_index]
if label in ["R-", "Pt."]:
# Ignore some columns that we don't need
pass
elif label == "Revised":
# It is possible for an opinion to have
# multiple revisions, so we need to iterate
                # over the links in the cell
for anchor in cell.xpath("a"):
case["revisions"].append(
{
"href": anchor.xpath("@href")[0],
"date_string": anchor.text_content(),
}
)
else:
# Handle normal data cells
case[label] = text
href = cell.xpath("./a/@href")
if href:
case[f"{label}_Url"] = href[0]
cell_index += 1
return case

def _get_case_names(self):
return [case["Name"] for case in self.cases]

def _get_download_urls(self):
return [case["Name_Url"] for case in self.cases]

def _get_case_dates(self):
return [convert_date_string(case["Date"]) for case in self.cases]

def _get_docket_numbers(self):
return [case["Docket"] for case in self.cases]

def _get_judges(self):
return [case["judge"] for case in self.cases]

def _get_precedential_statuses(self):
return [self.precedential] * len(self.cases)

def _download_backwards(self, d):
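        # Zero-pad single-digit term years, e.g. 5 -> "05".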
self.yy = str(d if d >= 10 else f"0{d}")
logger.info(f"Running backscraper for year: 20{self.yy}")
self.html = self._download()
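
# A minimal usage sketch (hypothetical, not part of this commit; it follows
# the usual juriscraper pattern of instantiating a Site and calling parse()):
#
#     site = Site()
#     site.parse()
#     for name, url in zip(site.case_names, site.download_urls):
#         print(name, url)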
juriscraper/opinions/united_states/federal_appellate/scotus_slip.py (126 changes: 22 additions & 104 deletions)
@@ -5,15 +5,10 @@

from datetime import date

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
required_headers = ["Date", "Docket", "Name", "J."]
expected_headers = required_headers + ["Revised", "R-", "Pt."]
class Site(OpinionSiteLinear):
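    # OpinionSiteLinear derives the standard getters (names, dates, URLs,
    # etc.) from the dicts collected in self.cases below.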
justices = {
"A": "Samuel Alito",
"AB": "Amy Coney Barrett",
@@ -38,16 +33,11 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.yy = self._get_current_term()
self.back_scrape_iterable = list(range(6, int(self.yy) + 1))
self.status = "Published"
self.url_base = "https://www.supremecourt.gov/opinions"
self.path_table = "//table[@class='table table-bordered']"
self.path_row = f"{self.path_table}/tr[position() > 1]"
self.precedential = "Published"
self.court = "slipopinion"
        self.url = False
        self.headers = []
self.cases = []
self.url = f"{self.url_base}/{self.court}/{self.yy}"
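        # e.g. https://www.supremecourt.gov/opinions/slipopinion/23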

@staticmethod
def _get_current_term():
@@ -64,93 +54,21 @@ def _get_current_term():
else:
return today.strftime("%y")

def _download(self, request_dict={}):
if not self.test_mode_enabled():
self.set_url()
html = super()._download(request_dict)
self.extract_cases_from_html(html)
return html

def set_url(self):
self.url = f"{self.url_base}/{self.court}/{self.yy}"

def set_table_headers(self, html):
# Do nothing if table is missing
if html.xpath(self.path_table):
path = f"{self.path_table}//th"
self.headers = [
cell.text_content().strip() for cell in html.xpath(path)
]
# Ensure that expected/required headers are present
if not set(self.required_headers).issubset(self.headers):
raise InsanityException("Required table column missing")

def extract_cases_from_html(self, html):
self.set_table_headers(html)
for row in html.xpath(self.path_row):
case = self.extract_case_data_from_row(row)
if case:
                # Raises a KeyError if an unknown judge key is encountered
                # (i.e., a newly appointed SCOTUS justice).
case["judge"] = self.justices[case["J."]] if case["J."] else ""
self.cases.append(case)
for revision_data in case["revisions"]:
revision = case.copy()
revision["Date"] = revision_data["date_string"]
revision["Name_Url"] = revision_data["href"]
self.cases.append(revision)

def extract_case_data_from_row(self, row):
cell_index = 0
case = {"revisions": []}
# Process each cell in row
for cell in row.xpath("./td"):
text = cell.text_content().strip()
# Skip rows with blank first cell
if cell_index == 0 and not text:
break
label = self.headers[cell_index]
if label in ["R-", "Pt."]:
# Ignore some columns that we don't need
pass
elif label == "Revised":
# It is possible for an opinion to have
# multiple revisions, so we need to iterate
                # over the links in the cell
for anchor in cell.xpath("a"):
case["revisions"].append(
{
"href": anchor.xpath("@href")[0],
"date_string": anchor.text_content(),
}
)
else:
# Handle normal data cells
case[label] = text
href = cell.xpath("./a/@href")
if href:
case[f"{label}_Url"] = href[0]
cell_index += 1
return case

def _get_case_names(self):
return [case["Name"] for case in self.cases]

def _get_download_urls(self):
return [case["Name_Url"] for case in self.cases]

def _get_case_dates(self):
return [convert_date_string(case["Date"]) for case in self.cases]

def _get_docket_numbers(self):
return [case["Docket"] for case in self.cases]

def _get_judges(self):
return [case["judge"] for case in self.cases]

def _get_precedential_statuses(self):
return [self.precedential] * len(self.cases)

def _download_backwards(self, d):
self.yy = str(d if d >= 10 else f"0{d}")
logger.info(f"Running backscraper for year: 20{self.yy}")
self.html = self._download()
    def _process_html(self):
        for row in self.html.xpath("//tr"):
            cells = row.xpath(".//td")
            # Opinion rows have exactly six cells; skip header and
            # spacer rows.
            if len(cells) != 6:
                continue
            # Unpack the six cells: the first is unused; then date,
            # docket, name (with the PDF link), justice initials,
            # and citation.
            _, date_cell, docket, link, justice, citation = cells
            if not link.text_content():
                continue
            self.cases.append(
                {
                    "citation": citation.text_content(),
                    "date": date_cell.text_content(),
                    "url": link.xpath(".//a/@href")[0],
                    "name": link.text_content(),
                    "docket": docket.text_content(),
                    # Raises a KeyError if a new justice's initials
                    # appear before the justices map is updated.
                    "judge": self.justices[justice.text_content()],
                }
            )
