fix(pacer): Moved common methods to utils

freelawproject · Dec 31, 2024 · 7803d48 · 7803d48
1 parent 1dfe6b7
commit 7803d48
Show file tree

Hide file tree

Showing 6 changed files with 1,254 additions and 114 deletions.
diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py
@@ -2,8 +2,9 @@
 import re
 import sys
 from collections import OrderedDict
+from typing import Optional
 
-from lxml.html import tostring
+from lxml import html
 
 from ..lib.judge_parsers import normalize_judge_string
 from ..lib.log_tools import make_default_logger
@@ -18,6 +19,8 @@
 from .reports import BaseReport
 from .utils import (
     get_court_id_from_url,
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     is_pdf,
 )
@@ -562,17 +565,20 @@ def parties(self):
         self._parties = parties
         return parties
 
-    def _get_attachment_number(self, row):
+    def _get_attachment_number(self, row: html.HtmlElement) -> int:
         """Return the attachment number for an item.
 
-        In district courts, this can be easily extracted. In bankruptcy courts,
-        you must extract it, then subtract 1 from the value since these are
-        tallied and include the main document.
+        :param row: Table row as an lxml element
+        :return: Attachment number for row
         """
         return int(row.xpath(".//td/text()")[0].strip())
 
-    def _get_description_from_tr(self, row):
-        """Get the description from the row"""
+    def _get_description_from_tr(self, row: html.HtmlElement) -> str:
+        """Get the description from the row
+
+        :param row: Table row
+        :return: Attachment description
+        """
         description_text_nodes = row.xpath(f"./td[4]//text()")
         if not description_text_nodes:
             # No text in the cell.
@@ -581,38 +587,20 @@ def _get_description_from_tr(self, row):
         return force_unicode(description)
 
     @staticmethod
-    def _get_input_value_from_tr(tr, idx):
-        """Take a row from the attachment table and return the input value by
-        index.
-        """
-        try:
-            input = tr.xpath(".//input")[0]
-        except IndexError:
-            return None
-        else:
-            # value="6828943 14732034 1 62576"
-            # "62576" is size in bytes "1" is pages
-            value = input.xpath("./@value")[0]
-            split_value = value.split(" ")
-            if len(split_value) != 4:
-                return None
-            return split_value[idx]
-
-    @staticmethod
-    def _get_page_count_from_tr(tr):
+    def _get_page_count_from_tr(tr: html.HtmlElement) -> Optional[int]:
         """Take a row from the attachment table and return the page count as an
         int extracted from the input value.
         """
-        count = AppellateDocketReport._get_input_value_from_tr(tr, 2)
+        count = get_input_value_from_tr(tr, 2, 4, " ")
         if count is not None:
             return int(count)
 
     @staticmethod
-    def _get_file_size_bytes_from_tr(tr):
+    def _get_file_size_bytes_from_tr(tr: html.HtmlElement) -> Optional[int]:
         """Take a row from the attachment table and return the number of bytes
         as an int.
         """
-        file_size_str = AppellateDocketReport._get_input_value_from_tr(tr, 3)
+        file_size_str = get_input_value_from_tr(tr, 3, 4, " ")
         if file_size_str is None:
             return None
         file_size = int(file_size_str)
@@ -621,23 +609,11 @@ def _get_file_size_bytes_from_tr(tr):
         return file_size
 
     @staticmethod
-    def _get_file_size_str_from_tr(tr):
-        """Take a row from the attachment table and return the number of bytes
-        as a str.
-        """
-        cells = tr.xpath("./td")
-        last_cell_contents = cells[-1].text_content()
-        units = ["kb", "mb"]
-        if any(unit in last_cell_contents.lower() for unit in units):
-            return last_cell_contents.strip()
-        return ""
-
-    @staticmethod
-    def _get_pacer_doc_id(row):
+    def _get_pacer_doc_id(row: html.HtmlElement) -> str:
         return row.xpath(".//a/@data-pacer-doc-id")
 
     @staticmethod
-    def _get_pacer_seq_no_from_tr(row):
+    def _get_pacer_seq_no_from_tr(row: html.HtmlElement) -> Optional[str]:
         """Take a row of the attachment table, and return the sequence number
         from the name attribute.
         """
@@ -666,7 +642,7 @@ def _get_attachments(self, cells):
                 "attachment_number": self._get_attachment_number(row),
                 "description": self._get_description_from_tr(row),
                 "page_count": self._get_page_count_from_tr(row),
-                "file_size_str": self._get_file_size_str_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
                 "pacer_doc_id": self._get_pacer_doc_id(row),
                 # It may not be needed to reparse the seq_no
                 # for each row, but we may as well. So far, it

diff --git a/juriscraper/pacer/attachment_page.py b/juriscraper/pacer/attachment_page.py
@@ -7,6 +7,8 @@
 from .reports import BaseReport
 from .utils import (
     get_court_id_from_doc_id_prefix,
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     reverse_goDLS_function,
 )
@@ -93,15 +95,13 @@ def data(self):
             file_size_bytes = self._get_file_size_bytes_from_tr(first_row)
             if file_size_bytes is not None:
                 result["file_size_bytes"] = file_size_bytes
-            result["file_size_str"] = self._get_file_size_str_from_tr(
-                first_row
-            )
+            result["file_size_str"] = get_file_size_str_from_tr(first_row)
         for row in rows:
             attachment = {
                 "attachment_number": self._get_attachment_number(row),
                 "description": self._get_description_from_tr(row),
                 "page_count": self._get_page_count_from_tr(row),
-                "file_size_str": self._get_file_size_str_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
                 "pacer_doc_id": self._get_pacer_doc_id(row),
                 # It may not be needed to reparse the seq_no
                 # for each row, but we may as well. So far, it
@@ -272,30 +272,12 @@ def _get_description_from_tr(self, row):
         description = description_text_nodes[0].strip()
         return force_unicode(description)
 
-    @staticmethod
-    def _get_input_value_from_tr(tr, idx):
-        """Take a row from the attachment table and return the input value by
-        index.
-        """
-        try:
-            input = tr.xpath(".//input")[0]
-        except IndexError:
-            return None
-        else:
-            # initial value string "23515655-90555-2"
-            # "90555" is size in bytes "2" is pages
-            value = input.xpath("./@value")[0]
-            split_value = value.split("-")
-            if len(split_value) != 3:
-                return None
-            return split_value[idx]
-
     @staticmethod
     def _get_page_count_from_tr_input_value(tr):
         """Take a row from the attachment table and return the page count as an
         int extracted from the input value.
         """
-        count = AttachmentPage._get_input_value_from_tr(tr, 2)
+        count = get_input_value_from_tr(tr, 2, 3, "-")
         if count is not None:
             return int(count)
 
@@ -327,26 +309,14 @@ def _get_file_size_bytes_from_tr(tr):
         """Take a row from the attachment table and return the number of bytes
         as an int.
         """
-        file_size_str = AttachmentPage._get_input_value_from_tr(tr, 1)
+        file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
         if file_size_str is None:
             return None
         file_size = int(file_size_str)
         if file_size == 0:
             return None
         return file_size
 
-    @staticmethod
-    def _get_file_size_str_from_tr(tr):
-        """Take a row from the attachment table and return the number of bytes
-        as an int.
-        """
-        cells = tr.xpath("./td")
-        last_cell_contents = cells[-1].text_content()
-        units = ["kb", "mb"]
-        if any(unit in last_cell_contents.lower() for unit in units):
-            return last_cell_contents.strip()
-        return ""
-
     @staticmethod
     def _get_pacer_doc_id(row):
         """Take in a row from the attachment table and return the pacer_doc_id

diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
@@ -21,6 +21,8 @@
 from .docket_utils import normalize_party_types
 from .reports import BaseReport
 from .utils import (
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     get_pacer_seq_no_from_doc1_anchor,
 )
@@ -1177,30 +1179,12 @@ def _get_attachment_id_value_from_tr(tr, idx):
                 return None
             return split_value[idx]
 
-    @staticmethod
-    def _get_input_value_from_tr(tr, idx):
-        """Take a row from the attachment table and return the input value by
-        index.
-        """
-        try:
-            input = tr.xpath(".//input")[0]
-        except IndexError:
-            return None
-        else:
-            # initial value string "23515655-90555-2"
-            # "90555" is size in bytes "2" is pages
-            value = input.xpath("./@value")[0]
-            split_value = value.split("-")
-            if len(split_value) != 3:
-                return None
-            return split_value[idx]
-
     @staticmethod
     def _get_page_count_from_tr_input_value(tr):
         """Take a row from the attachment table and return the page count as an
         int extracted from the input value.
         """
-        count = DocketReport._get_input_value_from_tr(tr, 2)
+        count = get_input_value_from_tr(tr, 2, 3, "-")
         if count is not None:
             return int(count)
 
@@ -1238,26 +1222,14 @@ def _get_file_size_bytes_from_tr(tr):
                 tr, 1
             )
         else:
-            file_size_str = DocketReport._get_input_value_from_tr(tr, 1)
+            file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
         if file_size_str is None:
             return None
         file_size = int(file_size_str)
         if file_size == 0:
             return None
         return file_size
 
-    @staticmethod
-    def _get_file_size_str_from_tr(tr):
-        """Take a row from the attachment table and return the number of bytes
-        as a str.
-        """
-        cells = tr.xpath("./td")
-        last_cell_contents = cells[-1].text_content()
-        units = ["kb", "mb"]
-        if any(unit in last_cell_contents.lower() for unit in units):
-            return last_cell_contents.strip()
-        return ""
-
     def _get_pacer_doc_id(self, row):
         """Take in a row from the attachment table and return the pacer_doc_id
         for the item in that row. Return None if the ID cannot be found.
@@ -1275,7 +1247,7 @@ def _get_pacer_doc_id(self, row):
             if value:
                 pacer_doc_suffix = value[0]
         else:
-            pacer_doc_suffix = DocketReport._get_input_value_from_tr(row, 0)
+            pacer_doc_suffix = get_input_value_from_tr(row, 0, 3, "-")
         if pacer_doc_suffix is None:
             return None
         # after inserting prefixes our final doc_id is "035023515655"
@@ -1315,7 +1287,7 @@ def _get_attachments(self, cells):
                 "attachment_number": self._get_attachment_number(row),
                 "description": self._get_description_from_tr(row),
                 "page_count": self._get_page_count_from_tr(row),
-                "file_size_str": self._get_file_size_str_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
                 "pacer_doc_id": self._get_pacer_doc_id(row),
                 # It may not be needed to reparse the seq_no
                 # for each row, but we may as well. So far, it

diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
@@ -853,3 +853,44 @@ def parse_sumDocSelected_from_row(
         if onclick and "sumDocSelected" in onclick[0]:
             return reverse_sumDocSelected_function(onclick[0])
     return None
+
+
+def get_input_value_from_tr(
+    tr: html.HtmlElement, idx: int, expected_values: int, split_value: str
+) -> Optional[str]:
+    """Return one field of the first ``<input>`` value found in a table row.
+
+    The value is split on ``split_value``; e.g. "6828943 14732034 1 62576"
+    (space, 4 fields) or "23515655-90555-2" (dash, 3 fields), where "62576"
+    and "90555" are sizes in bytes and "1" and "2" are page counts.
+
+    :param tr: An HTML row element from which the input value is extracted.
+    :param idx: The index of the field to return from the split list.
+    :param expected_values: The expected number of fields in the value.
+    :param split_value: The delimiter used to split the value string.
+    :return: The field at ``idx``, or None if the row has no input, the
+        input has no value attribute, or the field count is unexpected.
+    """
+    inputs = tr.xpath(".//input")
+    if not inputs:
+        return None
+    # Guard the attribute lookup too, so a value-less input yields None.
+    values = inputs[0].xpath("./@value")
+    if not values:
+        return None
+    parts = values[0].split(split_value)
+    if len(parts) != expected_values:
+        return None
+    return parts[idx]
+
+
+def get_file_size_str_from_tr(tr: html.HtmlElement) -> str:
+    """Return the row's last-cell text if it names a size in KB or MB.
+
+    :param tr: A row from the attachment table.
+    :return: The stripped last ``<td>`` text, or "" if there is no such
+        cell or its text contains neither "kb" nor "mb" (any case).
+    """
+    cells = tr.xpath("./td")
+    text = cells[-1].text_content().strip() if cells else ""
+    return text if any(u in text.lower() for u in ("kb", "mb")) else ""