Commit
Merge pull request #1161 from freelawproject/update-ny-ct-app
fix(ny): Fix NY Court of Appeals
flooie authored Sep 10, 2024
2 parents 23c03a2 + 657098a commit 551ee21
Showing 40 changed files with 5,216 additions and 17,471 deletions.
26 changes: 26 additions & 0 deletions juriscraper/lib/auth_utils.py
@@ -0,0 +1,26 @@
import os

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSite import OpinionSite


def set_api_token_header(site: OpinionSite) -> None:
"""
Puts the NY_API_TOKEN in the X-Api-Token header
Creates the Site.headers attribute, copying the
scraper_site.request[headers]
:param scraper_site: a Site Object
:returns: None
"""
if site.test_mode_enabled():
return
api_token = os.environ.get("NY_API_TOKEN", None)
if not api_token:
logger.warning(
"NY_API_TOKEN environment variable is not set. "
f"It is required for scraping New York Court: {site.court_id}"
)
return
site.request["headers"]["X-APIKEY"] = api_token
site.needs_special_headers = True
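
A minimal usage sketch (not part of this diff; the token value is illustrative). In a non-test run, instantiating an NY scraper copies the environment token into the request headers:

import os

os.environ["NY_API_TOKEN"] = "example-token"  # hypothetical; normally set in the deployment environment

from juriscraper.opinions.united_states.state import ny

site = ny.Site()  # __init__ ends by calling set_api_token_header(site)
print(site.request["headers"]["X-APIKEY"])  # "example-token"
print(site.needs_special_headers)           # True
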
222 changes: 110 additions & 112 deletions juriscraper/opinions/united_states/state/ny.py
@@ -6,132 +6,130 @@
2014-07-04: Created by Andrei Chelaru, reviewed by mlr.
2015-10-23: Parts rewritten by mlr.
2016-05-04: Updated by arderyp to handle typos in docket string format
2024-09-05: Updated by flooie to deal with a block from the main website
"""

import os
import re
from datetime import date
from datetime import date, timedelta
from typing import Any, Dict, Optional, Tuple

from juriscraper.AbstractSite import logger
from juriscraper.lib.auth_utils import set_api_token_header
from juriscraper.lib.html_utils import get_html5_parsed_text
from juriscraper.lib.judge_parsers import normalize_judge_string
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


def set_api_token_header(scraper_site: OpinionSite) -> None:
"""
Puts the NY_API_TOKEN in the X-APIKEY header
Creates the Site.headers attribute, copying the
scraper_site.request[headers]
:param scraper_site: a Site Object
:returns: None
"""
if scraper_site.test_mode_enabled():
return

api_token = os.environ.get("NY_API_TOKEN")
if not api_token:
logger.warning(
"NY_API_TOKEN environment variable is not set. "
"It is required for scraping New York Courts"
)
return

scraper_site.request["headers"]["X-APIKEY"] = api_token
scraper_site.needs_special_headers = True


class Site(OpinionSiteLinear):
first_opinion_date = date(2003, 9, 25)
days_interval = 30

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court = "Court of Appeals"
self.court_id = self.__module__
self.url = "https://iapps.courts.state.ny.us/lawReporting/Search?searchType=opinion"
self._set_parameters()
self.expected_content_types = ["application/pdf", "text/html"]
self.make_backscrape_iterable(kwargs)
set_api_token_header(self)


class Site(OpinionSite):
DOWNLOAD_URL_SUB_PATH = "td[2]//@href[not(contains(., 'DecisionList'))]"
FOUR_CELLS_SUB_PATH = "//*[count(td)=3"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
today = date.today()
# https://www.nycourts.gov/ctapps/Decisions/2015/Dec15/December15.html
self.url = "http://www.nycourts.gov/ctapps/Decisions/{year}/{mon}{yr}/{month}{yr}.html".format(
year=today.year,
yr=today.strftime("%y"),
mon=today.strftime("%b"),
month=today.strftime("%B"),
)

def _make_html_tree(self, text):
return get_html5_parsed_text(text)

def _get_case_names(self):
path = f"{self.FOUR_CELLS_SUB_PATH} and {self.DOWNLOAD_URL_SUB_PATH}]"
case_names = []
for element in self.html.xpath(path):
case_name_parts = []
for t in element.xpath("./td[3]/p/font/text()"):
if t.strip():
case_name_parts.append(t)
if not case_name_parts:
# No hits for first XPath, try another that sometimes works.
for t in element.xpath("./td[3]//text()"):
if t.strip():
case_name_parts.append(t)
if case_name_parts:
case_names.append(", ".join(case_name_parts))
return case_names

def _get_download_urls(self):
return self.html.xpath(
f"{self.FOUR_CELLS_SUB_PATH}]/{self.DOWNLOAD_URL_SUB_PATH}"
)

def _set_parameters(
self,
start_date: Optional[date] = None,
end_date: Optional[date] = None,
) -> None:
"""Set the parameters for the POST request.
If no start or end dates are given, scrape the last 30 days;
this is the default behaviour for the current scraper.
:param start_date: optional start of the date range
:param end_date: optional end of the date range
:return: None
"""
self.method = "POST"

if not end_date:
end_date = date.today()
start_date = end_date - timedelta(days=30)

self.parameters = {
"rbOpinionMotion": "opinion",
"Pty": "",
"and_or": "and",
"dtStartDate": start_date.strftime("%m/%d/%Y"),
"dtEndDate": end_date.strftime("%m/%d/%Y"),
"court": self.court,
"docket": "",
"judge": "",
"slipYear": "",
"slipNo": "",
"OffVol": "",
"Rptr": "",
"OffPage": "",
"fullText": "",
"and_or2": "and",
"Order_By": "Party Name",
"Submit": "Find",
"hidden1": "",
"hidden2": "",
}
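
For reference, the search form that _set_parameters targets can be exercised directly. This sketch (using requests, not part of the diff; abridged to the non-empty fields, while the scraper sends the full field set shown above) posts an explicit date range:

from datetime import date, timedelta

import requests

end = date.today()
start = end - timedelta(days=30)
response = requests.post(
    "https://iapps.courts.state.ny.us/lawReporting/Search?searchType=opinion",
    data={
        "rbOpinionMotion": "opinion",
        "dtStartDate": start.strftime("%m/%d/%Y"),
        "dtEndDate": end.strftime("%m/%d/%Y"),
        "court": "Court of Appeals",
        "Order_By": "Party Name",
        "Submit": "Find",
        # remaining fields from _set_parameters are sent as empty strings
    },
    headers={"X-APIKEY": "example-token"},  # hypothetical token value
)
print(response.status_code)
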

def _process_html(self):
for row in self.html.xpath(".//table")[-1].xpath(".//tr")[1:]:
slip_cite = " ".join(row.xpath("./td[5]//text()"))
official_citation = " ".join(row.xpath("./td[4]//text()"))
url = row.xpath(".//a")[0].get("href")
url = re.findall(r"(http.*htm)", url)[0]
status = "Unpublished" if "(U)" in slip_cite else "Published"
case = {
"name": row.xpath(".//td")[0].text_content(),
"date": row.xpath(".//td")[1].text_content(),
"url": url,
"status": status,
"docket": "",
"citation": official_citation,
"parallel_citation": slip_cite,
"author": "",
"per_curiam": False,
}
author = row.xpath("./td")[-2].text_content()

# Collapse whitespace so "P E R C U R I A M", "PER CURIAM", and "Per Curiam" all match
pc = re.sub(r"\s", "", author.lower())
if "percuriam" in pc:
case["per_curiam"] = True
elif author:
cleaned_author = normalize_judge_string(author)[0]
if cleaned_author.endswith(" J."):
cleaned_author = cleaned_author[:-3]
case["author"] = cleaned_author
self.cases.append(case)
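
A quick illustration of the whitespace-stripping trick used for the per curiam check (the author strings are illustrative):

import re

for author in ("P E R  C U R I A M", "PER CURIAM", "Per Curiam"):
    pc = re.sub(r"\s", "", author.lower())
    print(pc)  # "percuriam" in all three cases
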

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the docket number from the text?
:param scraped_text: The content of the document downloaded
:return: Metadata to be added to the case
"""
dockets = re.search(
r"^<br>(?P<docket_number>No\. \d+(\s+SSM \d+)?)\s?$",
scraped_text[:2000],
re.MULTILINE,
)
if dockets:
return {"Docket": dockets.groupdict()}
return {}
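
A worked example of the docket regex against a plausible document head (the sample text is invented for illustration):

import re

sample = "<br>No. 71 SSM 14\nThe People v. Example"  # hypothetical document head
match = re.search(
    r"^<br>(?P<docket_number>No\. \d+(\s+SSM \d+)?)\s?$",
    sample[:2000],
    re.MULTILINE,
)
if match:
    print(match.groupdict())  # {'docket_number': 'No. 71 SSM 14'}
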

def _get_case_dates(self):
case_dates = []
case_date = False
# Process rows. If it's a date row,
# save the date, continue, then add
# date for each opinion row below it
for row in self.html.xpath("//tr[not(.//table)]"):
date_from_row = self.get_date_from_text(row.text_content())
if date_from_row:
case_date = date_from_row
continue
elif self._row_contains_opinion(row) and case_date:
case_dates.append(case_date)
return case_dates

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_names)

def _get_docket_numbers(self):
docket_numbers = []
for cell in self.html.xpath(f"{self.FOUR_CELLS_SUB_PATH}]/td[1]"):
text = cell.text_content()
date_from_text = self.get_date_from_text(text)
if not date_from_text:
if re.search(r"N(o|O|0)\.?,?", text):
docket = self._sanitize_docket_string(text)
docket_numbers.append(docket)
return docket_numbers

def _sanitize_docket_string(self, raw_docket_string):
"""Handle typos and non-standard docket number strings
Dockets on this page should be in the format "No. #",
but sometimes the period is missing, or a comma is used
instead. We want to strip all variations of that out
and replace slash delimiters with comma delimiters.
"""
for abbreviation in ["No.", "No ", "No, ", "NO. "]:
raw_docket_string = raw_docket_string.replace(abbreviation, "")
return ", ".join(raw_docket_string.split(" / "))

def _row_contains_opinion(self, row):
p1 = "./td[3]"
p2 = f"./{self.DOWNLOAD_URL_SUB_PATH}"
return row.xpath(p1) and row.xpath(p2)

def get_date_from_text(self, text):
try:
return convert_date_string(text)
except ValueError:
return False

def _download_backwards(self, dates: Tuple[date]) -> None:
"""Make custom date range request
:param dates: (start_date, end_date) tuple
:return: None
"""
logger.info("Backscraping for range %s %s", *dates)
self._set_parameters(*dates)
self.html = self._download()
self._process_html()
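
A sketch of driving the backscraper by hand for a single window (the explicit _download_backwards call stands in for whatever driver normally iterates the backscrape windows):

from datetime import date

from juriscraper.opinions.united_states.state import ny

site = ny.Site()  # assumes NY_API_TOKEN is exported (see auth_utils.py above)
site._download_backwards((date(2024, 1, 1), date(2024, 1, 31)))
for case in site.cases:
    print(case["date"], case["docket"], case["name"])
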
116 changes: 23 additions & 93 deletions juriscraper/opinions/united_states/state/nyappdiv_1st.py
@@ -4,105 +4,35 @@
# Author: Andrei Chelaru
# Reviewer: mlr
# Date: 2014-07-04

import re
from datetime import date
from typing import Any, Dict

from dateutil.rrule import MONTHLY, rrule
from juriscraper.opinions.united_states.state import ny

from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import get_html5_parsed_text
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.opinions.united_states.state.ny import set_api_token_header
from juriscraper.OpinionSite import OpinionSite

class Site(ny.Site):
first_opinion_date = date(2003, 9, 25)
days_interval = 30

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.court = "App Div, 1st Dept"
self.make_backscrape_iterable(kwargs)
self._set_parameters()

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the docket number from the text?
:param scraped_text: The content of the document downloaded
:return: Metadata to be added to the case
"""
dockets = re.search(
r"^<br>(?P<docket_number>(Docket|Index|Ind\.|Motion|SCI) No\..* Case No\..*)\s+?$",
scraped_text[:2000],
re.MULTILINE,
)
if dockets:
return {"Docket": dockets.groupdict()}
return {}


class Site(OpinionSite):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
date_keys = rrule(
MONTHLY, dtstart=date(2003, 11, 1), until=date(2015, 8, 30)
)
self.back_scrape_iterable = [i.date() for i in date_keys]
self.row_base_path = '//tr[contains(./td[1]/a/@href, "3d")]'
self.division = 1
self.url = self.build_url()
self.expected_content_types = ["application/pdf", "text/html"]
set_api_token_header(self)

def _get_case_names(self):
path = f"{self.row_base_path}/td[1]"
return [cell.text_content() for cell in self.html.xpath(path)]

def build_url(self, target_date=False):
base = (
"http://www.courts.state.ny.us/reporter/slipidx/aidxtable_%s"
% self.division
)
if target_date:
return "{}_{}_{}.shtml".format(
base,
target_date.year,
target_date.strftime("%B"),
)
else:
return f"{base}.shtml"

def _get_download_urls(self):
path = f"{self.row_base_path}/td[1]//a/@href"
return self.html.xpath(path)

def _get_case_dates(self):
case_dates = []
for element in self.html.xpath("//caption | //center"):
date_string = (
element.text_content().strip().replace("Cases Decided ", "")
)
path_prefix = (
"./parent::"
if element.tag == "caption"
else "./following-sibling::"
)
path = f"{path_prefix}table[1]{self.row_base_path}"
cases = element.xpath(path)
case_dates.extend([convert_date_string(date_string)] * len(cases))
return case_dates

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_names)

def _get_docket_numbers(self):
path = f"{self.row_base_path}/td[3]"
return list(
map(
self._add_str_to_list_where_empty_element,
self.html.xpath(path),
)
)

def _get_judges(self):
path = f"{self.row_base_path}/td[2]"
return list(
map(
self._add_str_to_list_where_empty_element,
self.html.xpath(path),
)
)

def _get_citations(self):
path = f"{self.row_base_path}/td[4]"
return [cell.text_content().strip() for cell in self.html.xpath(path)]

@staticmethod
def _add_str_to_list_where_empty_element(element):
string_list = element.xpath("./text()")
return string_list[0] if string_list else ""

def _download_backwards(self, target_date):
self.crawl_date = target_date
logger.info(f"Running backscraper with date: {target_date}")
self.url = self.build_url(target_date=target_date)
self.html = self._download()

def _make_html_tree(self, text):
return get_html5_parsed_text(text)
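
The net effect of the rewrite is that the First Department scraper becomes a thin subclass of ny.Site. A quick check of what actually changes (a sketch, not part of the diff):

from juriscraper.opinions.united_states.state import ny, nyappdiv_1st

base = ny.Site()
first_dept = nyappdiv_1st.Site()

# Same endpoint and POST form; only the court field differs.
assert first_dept.url == base.url
assert base.parameters["court"] == "Court of Appeals"
assert first_dept.parameters["court"] == "App Div, 1st Dept"
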