Commit
Merge pull request #1161 from freelawproject/update-ny-ct-app
fix(ny): Fix NY Court of Appeals
flooie authored Sep 10, 2024
2 parents 23c03a2 + 657098a commit 551ee21
Showing 40 changed files with 5,216 additions and 17,471 deletions.
26 changes: 26 additions & 0 deletions juriscraper/lib/auth_utils.py
@@ -0,0 +1,26 @@
import os

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSite import OpinionSite


def set_api_token_header(site: OpinionSite) -> None:
"""
Puts the NY_API_TOKEN in the X-Api-Token header
Creates the Site.headers attribute, copying the
scraper_site.request[headers]
:param scraper_site: a Site Object
:returns: None
"""
if site.test_mode_enabled():
return
api_token = os.environ.get("NY_API_TOKEN", None)
if not api_token:
logger.warning(
"NY_API_TOKEN environment variable is not set. "
f"It is required for scraping New York Court: {site.court_id}"
)
return
site.request["headers"]["X-APIKEY"] = api_token
site.needs_special_headers = True
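
A minimal usage sketch (not part of this diff; the token value is illustrative). In a non-test run, instantiating an NY scraper copies the environment token into the request headers:

import os

os.environ["NY_API_TOKEN"] = "example-token"  # hypothetical; normally set in the deployment environment

from juriscraper.opinions.united_states.state import ny

site = ny.Site()  # __init__ ends by calling set_api_token_header(site)
print(site.request["headers"]["X-APIKEY"])  # "example-token"
print(site.needs_special_headers)           # True
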
222 changes: 110 additions & 112 deletions juriscraper/opinions/united_states/state/ny.py
@@ -6,132 +6,130 @@
2014-07-04: Created by Andrei Chelaru, reviewed by mlr.
2015-10-23: Parts rewritten by mlr.
2016-05-04: Updated by arderyp to handle typos in docket string format
2024-09-05: Updated by flooie to deal with a block from the main website
"""

import os
import re
from datetime import date
from datetime import date, timedelta
from typing import Any, Dict, Optional, Tuple

from juriscraper.AbstractSite import logger
from juriscraper.lib.auth_utils import set_api_token_header
from juriscraper.lib.html_utils import get_html5_parsed_text
from juriscraper.lib.judge_parsers import normalize_judge_string
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


def set_api_token_header(scraper_site: OpinionSite) -> None:
"""
Puts the NY_API_TOKEN in the X-APIKEY header
Creates the Site.headers attribute, copying the
scraper_site.request[headers]
:param scraper_site: a Site Object
:returns: None
"""
if scraper_site.test_mode_enabled():
return

api_token = os.environ.get("NY_API_TOKEN")
if not api_token:
logger.warning(
"NY_API_TOKEN environment variable is not set. "
"It is required for scraping New York Courts"
)
return

scraper_site.request["headers"]["X-APIKEY"] = api_token
scraper_site.needs_special_headers = True


class Site(OpinionSiteLinear):
first_opinion_date = date(2003, 9, 25)
days_interval = 30

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court = "Court of Appeals"
self.court_id = self.__module__
self.url = "https://iapps.courts.state.ny.us/lawReporting/Search?searchType=opinion"
self._set_parameters()
self.expected_content_types = ["application/pdf", "text/html"]
self.make_backscrape_iterable(kwargs)
set_api_token_header(self)


class Site(OpinionSite):
DOWNLOAD_URL_SUB_PATH = "td[2]//@href[not(contains(., 'DecisionList'))]"
FOUR_CELLS_SUB_PATH = "//*[count(td)=3"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
today = date.today()
# https://www.nycourts.gov/ctapps/Decisions/2015/Dec15/December15.html
self.url = "http://www.nycourts.gov/ctapps/Decisions/{year}/{mon}{yr}/{month}{yr}.html".format(
year=today.year,
yr=today.strftime("%y"),
mon=today.strftime("%b"),
month=today.strftime("%B"),
)

def _make_html_tree(self, text):
return get_html5_parsed_text(text)

def _get_case_names(self):
path = f"{self.FOUR_CELLS_SUB_PATH} and {self.DOWNLOAD_URL_SUB_PATH}]"
case_names = []
for element in self.html.xpath(path):
case_name_parts = []
for t in element.xpath("./td[3]/p/font/text()"):
if t.strip():
case_name_parts.append(t)
if not case_name_parts:
# No hits for first XPath, try another that sometimes works.
for t in element.xpath("./td[3]//text()"):
if t.strip():
case_name_parts.append(t)
if case_name_parts:
case_names.append(", ".join(case_name_parts))
return case_names

def _get_download_urls(self):
return self.html.xpath(
f"{self.FOUR_CELLS_SUB_PATH}]/{self.DOWNLOAD_URL_SUB_PATH}"
)

def _set_parameters(
self,
start_date: Optional[date] = None,
end_date: Optional[date] = None,
) -> None:
"""Set the parameters for the POST request.
If no start or end dates are given, scrape the last 30 days;
this is the default behaviour for the current scraper.
:param start_date: optional start of the date range
:param end_date: optional end of the date range
:return: None
"""
self.method = "POST"

if not end_date:
end_date = date.today()
start_date = end_date - timedelta(days=30)

self.parameters = {
"rbOpinionMotion": "opinion",
"Pty": "",
"and_or": "and",
"dtStartDate": start_date.strftime("%m/%d/%Y"),
"dtEndDate": end_date.strftime("%m/%d/%Y"),
"court": self.court,
"docket": "",
"judge": "",
"slipYear": "",
"slipNo": "",
"OffVol": "",
"Rptr": "",
"OffPage": "",
"fullText": "",
"and_or2": "and",
"Order_By": "Party Name",
"Submit": "Find",
"hidden1": "",
"hidden2": "",
}
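
For reference, the search form that _set_parameters targets can be exercised directly. This sketch (using requests, not part of the diff; abridged to the non-empty fields, while the scraper sends the full field set shown above) posts an explicit date range:

from datetime import date, timedelta

import requests

end = date.today()
start = end - timedelta(days=30)
response = requests.post(
    "https://iapps.courts.state.ny.us/lawReporting/Search?searchType=opinion",
    data={
        "rbOpinionMotion": "opinion",
        "dtStartDate": start.strftime("%m/%d/%Y"),
        "dtEndDate": end.strftime("%m/%d/%Y"),
        "court": "Court of Appeals",
        "Order_By": "Party Name",
        "Submit": "Find",
        # remaining fields from _set_parameters are sent as empty strings
    },
    headers={"X-APIKEY": "example-token"},  # hypothetical token value
)
print(response.status_code)
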

def _process_html(self):
for row in self.html.xpath(".//table")[-1].xpath(".//tr")[1:]:
slip_cite = " ".join(row.xpath("./td[5]//text()"))
official_citation = " ".join(row.xpath("./td[4]//text()"))
url = row.xpath(".//a")[0].get("href")
url = re.findall(r"(http.*htm)", url)[0]
status = "Unpublished" if "(U)" in slip_cite else "Published"
case = {
"name": row.xpath(".//td")[0].text_content(),
"date": row.xpath(".//td")[1].text_content(),
"url": url,
"status": status,
"docket": "",
"citation": official_citation,
"parallel_citation": slip_cite,
"author": "",
"per_curiam": False,
}
author = row.xpath("./td")[-2].text_content()

# Collapse whitespace so "P E R C U R I A M", "PER CURIAM", and "Per Curiam" all match
pc = re.sub(r"\s", "", author.lower())
if "percuriam" in pc:
case["per_curiam"] = True
elif author:
cleaned_author = normalize_judge_string(author)[0]
if cleaned_author.endswith(" J."):
cleaned_author = cleaned_author[:-3]
case["author"] = cleaned_author
self.cases.append(case)
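
A quick illustration of the whitespace-stripping trick used for the per curiam check (the author strings are illustrative):

import re

for author in ("P E R  C U R I A M", "PER CURIAM", "Per Curiam"):
    pc = re.sub(r"\s", "", author.lower())
    print(pc)  # "percuriam" in all three cases
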

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the docket number from the text?
:param scraped_text: The content of the document downloaded
:return: Metadata to be added to the case
"""
dockets = re.search(
r"^<br>(?P<docket_number>No\. \d+(\s+SSM \d+)?)\s?$",
scraped_text[:2000],
re.MULTILINE,
)
if dockets:
return {"Docket": dockets.groupdict()}
return {}
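
A worked example of the docket regex against a plausible document head (the sample text is invented for illustration):

import re

sample = "<br>No. 71 SSM 14\nThe People v. Example"  # hypothetical document head
match = re.search(
    r"^<br>(?P<docket_number>No\. \d+(\s+SSM \d+)?)\s?$",
    sample[:2000],
    re.MULTILINE,
)
if match:
    print(match.groupdict())  # {'docket_number': 'No. 71 SSM 14'}
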

def _get_case_dates(self):
case_dates = []
case_date = False
# Process rows. If it's a date row,
# save the date, continue, then add
# date for each opinion row below it
for row in self.html.xpath("//tr[not(.//table)]"):
date_from_row = self.get_date_from_text(row.text_content())
if date_from_row:
case_date = date_from_row
continue
elif self._row_contains_opinion(row) and case_date:
case_dates.append(case_date)
return case_dates

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_names)

def _get_docket_numbers(self):
docket_numbers = []
for cell in self.html.xpath(f"{self.FOUR_CELLS_SUB_PATH}]/td[1]"):
text = cell.text_content()
date_from_text = self.get_date_from_text(text)
if not date_from_text:
if re.search(r"N(o|O|0)\.?,?", text):
docket = self._sanitize_docket_string(text)
docket_numbers.append(docket)
return docket_numbers

def _sanitize_docket_string(self, raw_docket_string):
"""Handle typos and non-standard docket number strings
Dockets on this page should be in the format "No. #",
but sometimes the period is missing, or a comma is used
instead. We want to strip all variations of that out
and replace slash delimiters with comma delimiters.
"""
for abbreviation in ["No.", "No ", "No, ", "NO. "]:
raw_docket_string = raw_docket_string.replace(abbreviation, "")
return ", ".join(raw_docket_string.split(" / "))

def _row_contains_opinion(self, row):
p1 = "./td[3]"
p2 = f"./{self.DOWNLOAD_URL_SUB_PATH}"
return row.xpath(p1) and row.xpath(p2)

def get_date_from_text(self, text):
try:
return convert_date_string(text)
except ValueError:
return False

def _download_backwards(self, dates: Tuple[date]) -> None:
"""Make custom date range request
:param dates: (start_date, end_date) tuple
:return: None
"""
logger.info("Backscraping for range %s %s", *dates)
self._set_parameters(*dates)
self.html = self._download()
self._process_html()
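
A sketch of driving the backscraper by hand for a single window (the explicit _download_backwards call stands in for whatever driver normally iterates the backscrape windows):

from datetime import date

from juriscraper.opinions.united_states.state import ny

site = ny.Site()  # assumes NY_API_TOKEN is exported (see auth_utils.py above)
site._download_backwards((date(2024, 1, 1), date(2024, 1, 31)))
for case in site.cases:
    print(case["date"], case["docket"], case["name"])
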
116 changes: 23 additions & 93 deletions juriscraper/opinions/united_states/state/nyappdiv_1st.py
@@ -4,105 +4,35 @@
# Author: Andrei Chelaru
# Reviewer: mlr
# Date: 2014-07-04

import re
from datetime import date
from typing import Any, Dict

from dateutil.rrule import MONTHLY, rrule
from juriscraper.opinions.united_states.state import ny

from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import get_html5_parsed_text
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.opinions.united_states.state.ny import set_api_token_header
from juriscraper.OpinionSite import OpinionSite

class Site(ny.Site):
first_opinion_date = date(2003, 9, 25)
days_interval = 30

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.court = "App Div, 1st Dept"
self.make_backscrape_iterable(kwargs)
self._set_parameters()

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the docket number from the text?
:param scraped_text: The content of the document downloaded
:return: Metadata to be added to the case
"""
dockets = re.search(
r"^<br>(?P<docket_number>(Docket|Index|Ind\.|Motion|SCI) No\..* Case No\..*)\s+?$",
scraped_text[:2000],
re.MULTILINE,
)
if dockets:
return {"Docket": dockets.groupdict()}
return {}


class Site(OpinionSite):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
date_keys = rrule(
MONTHLY, dtstart=date(2003, 11, 1), until=date(2015, 8, 30)
)
self.back_scrape_iterable = [i.date() for i in date_keys]
self.row_base_path = '//tr[contains(./td[1]/a/@href, "3d")]'
self.division = 1
self.url = self.build_url()
self.expected_content_types = ["application/pdf", "text/html"]
set_api_token_header(self)

def _get_case_names(self):
path = f"{self.row_base_path}/td[1]"
return [cell.text_content() for cell in self.html.xpath(path)]

def build_url(self, target_date=False):
base = (
"http://www.courts.state.ny.us/reporter/slipidx/aidxtable_%s"
% self.division
)
if target_date:
return "{}_{}_{}.shtml".format(
base,
target_date.year,
target_date.strftime("%B"),
)
else:
return f"{base}.shtml"

def _get_download_urls(self):
path = f"{self.row_base_path}/td[1]//a/@href"
return self.html.xpath(path)

def _get_case_dates(self):
case_dates = []
for element in self.html.xpath("//caption | //center"):
date_string = (
element.text_content().strip().replace("Cases Decided ", "")
)
path_prefix = (
"./parent::"
if element.tag == "caption"
else "./following-sibling::"
)
path = f"{path_prefix}table[1]{self.row_base_path}"
cases = element.xpath(path)
case_dates.extend([convert_date_string(date_string)] * len(cases))
return case_dates

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_names)

def _get_docket_numbers(self):
path = f"{self.row_base_path}/td[3]"
return list(
map(
self._add_str_to_list_where_empty_element,
self.html.xpath(path),
)
)

def _get_judges(self):
path = f"{self.row_base_path}/td[2]"
return list(
map(
self._add_str_to_list_where_empty_element,
self.html.xpath(path),
)
)

def _get_citations(self):
path = f"{self.row_base_path}/td[4]"
return [cell.text_content().strip() for cell in self.html.xpath(path)]

@staticmethod
def _add_str_to_list_where_empty_element(element):
string_list = element.xpath("./text()")
return string_list[0] if string_list else ""

def _download_backwards(self, target_date):
self.crawl_date = target_date
logger.info(f"Running backscraper with date: {target_date}")
self.url = self.build_url(target_date=target_date)
self.html = self._download()

def _make_html_tree(self, text):
return get_html5_parsed_text(text)
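
The net effect of the rewrite is that the First Department scraper becomes a thin subclass of ny.Site. A quick check of what actually changes (a sketch, not part of the diff):

from juriscraper.opinions.united_states.state import ny, nyappdiv_1st

base = ny.Site()
first_dept = nyappdiv_1st.Site()

# Same endpoint and POST form; only the court field differs.
assert first_dept.url == base.url
assert base.parameters["court"] == "Court of Appeals"
assert first_dept.parameters["court"] == "App Div, 1st Dept"
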