Skip to content

Commit

Permalink
fix(pacer): Moved common methods to utils
Browse files Browse the repository at this point in the history
  • Loading branch information
albertisfu committed Dec 31, 2024
1 parent 1dfe6b7 commit 7803d48
Show file tree
Hide file tree
Showing 6 changed files with 1,254 additions and 114 deletions.
64 changes: 20 additions & 44 deletions juriscraper/pacer/appellate_docket.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import re
import sys
from collections import OrderedDict
from typing import Optional

from lxml.html import tostring
from lxml import html

from ..lib.judge_parsers import normalize_judge_string
from ..lib.log_tools import make_default_logger
Expand All @@ -18,6 +19,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_url,
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
is_pdf,
)
Expand Down Expand Up @@ -562,17 +565,20 @@ def parties(self):
self._parties = parties
return parties

def _get_attachment_number(self, row):
def _get_attachment_number(self, row: html.HtmlElement) -> int:
"""Return the attachment number for an item.
In district courts, this can be easily extracted. In bankruptcy courts,
you must extract it, then subtract 1 from the value since these are
tallied and include the main document.
:param row: Table row as an lxml element
:return: Attachment number for row
"""
return int(row.xpath(".//td/text()")[0].strip())

def _get_description_from_tr(self, row):
"""Get the description from the row"""
def _get_description_from_tr(self, row: html.HtmlElement) -> str:
"""Get the description from the row
:param row: Table row
:return: Attachment description
"""
description_text_nodes = row.xpath(f"./td[4]//text()")
if not description_text_nodes:
# No text in the cell.
Expand All @@ -581,38 +587,20 @@ def _get_description_from_tr(self, row):
return force_unicode(description)

@staticmethod
def _get_input_value_from_tr(tr, idx):
"""Take a row from the attachment table and return the input value by
index.
"""
try:
input = tr.xpath(".//input")[0]
except IndexError:
return None
else:
# value="6828943 14732034 1 62576"
# "62576" is size in bytes "1" is pages
value = input.xpath("./@value")[0]
split_value = value.split(" ")
if len(split_value) != 4:
return None
return split_value[idx]

@staticmethod
def _get_page_count_from_tr(tr):
def _get_page_count_from_tr(tr: html.HtmlElement) -> Optional[int]:
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
count = AppellateDocketReport._get_input_value_from_tr(tr, 2)
count = get_input_value_from_tr(tr, 2, 4, " ")
if count is not None:
return int(count)

@staticmethod
def _get_file_size_bytes_from_tr(tr):
def _get_file_size_bytes_from_tr(tr: html.HtmlElement) -> Optional[int]:
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
file_size_str = AppellateDocketReport._get_input_value_from_tr(tr, 3)
file_size_str = get_input_value_from_tr(tr, 3, 4, " ")
if file_size_str is None:
return None
file_size = int(file_size_str)
Expand All @@ -621,23 +609,11 @@ def _get_file_size_bytes_from_tr(tr):
return file_size

@staticmethod
def _get_file_size_str_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as a str.
"""
cells = tr.xpath("./td")
last_cell_contents = cells[-1].text_content()
units = ["kb", "mb"]
if any(unit in last_cell_contents.lower() for unit in units):
return last_cell_contents.strip()
return ""

@staticmethod
def _get_pacer_doc_id(row):
def _get_pacer_doc_id(row: html.HtmlElement) -> str:
return row.xpath(".//a/@data-pacer-doc-id")

@staticmethod
def _get_pacer_seq_no_from_tr(row):
def _get_pacer_seq_no_from_tr(row: html.HtmlElement) -> Optional[str]:
"""Take a row of the attachment table, and return the sequence number
from the name attribute.
"""
Expand Down Expand Up @@ -666,7 +642,7 @@ def _get_attachments(self, cells):
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"file_size_str": self._get_file_size_str_from_tr(row),
"file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
Expand Down
42 changes: 6 additions & 36 deletions juriscraper/pacer/attachment_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_doc_id_prefix,
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
reverse_goDLS_function,
)
Expand Down Expand Up @@ -93,15 +95,13 @@ def data(self):
file_size_bytes = self._get_file_size_bytes_from_tr(first_row)
if file_size_bytes is not None:
result["file_size_bytes"] = file_size_bytes
result["file_size_str"] = self._get_file_size_str_from_tr(
first_row
)
result["file_size_str"] = get_file_size_str_from_tr(first_row)
for row in rows:
attachment = {
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"file_size_str": self._get_file_size_str_from_tr(row),
"file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
Expand Down Expand Up @@ -272,30 +272,12 @@ def _get_description_from_tr(self, row):
description = description_text_nodes[0].strip()
return force_unicode(description)

@staticmethod
def _get_input_value_from_tr(tr, idx):
"""Take a row from the attachment table and return the input value by
index.
"""
try:
input = tr.xpath(".//input")[0]
except IndexError:
return None
else:
# initial value string "23515655-90555-2"
# "90555" is size in bytes "2" is pages
value = input.xpath("./@value")[0]
split_value = value.split("-")
if len(split_value) != 3:
return None
return split_value[idx]

@staticmethod
def _get_page_count_from_tr_input_value(tr):
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
count = AttachmentPage._get_input_value_from_tr(tr, 2)
count = get_input_value_from_tr(tr, 2, 3, "-")
if count is not None:
return int(count)

Expand Down Expand Up @@ -327,26 +309,14 @@ def _get_file_size_bytes_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
file_size_str = AttachmentPage._get_input_value_from_tr(tr, 1)
file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
if file_size == 0:
return None
return file_size

@staticmethod
def _get_file_size_str_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
cells = tr.xpath("./td")
last_cell_contents = cells[-1].text_content()
units = ["kb", "mb"]
if any(unit in last_cell_contents.lower() for unit in units):
return last_cell_contents.strip()
return ""

@staticmethod
def _get_pacer_doc_id(row):
"""Take in a row from the attachment table and return the pacer_doc_id
Expand Down
40 changes: 6 additions & 34 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from .docket_utils import normalize_party_types
from .reports import BaseReport
from .utils import (
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
get_pacer_seq_no_from_doc1_anchor,
)
Expand Down Expand Up @@ -1177,30 +1179,12 @@ def _get_attachment_id_value_from_tr(tr, idx):
return None
return split_value[idx]

@staticmethod
def _get_input_value_from_tr(tr, idx):
"""Take a row from the attachment table and return the input value by
index.
"""
try:
input = tr.xpath(".//input")[0]
except IndexError:
return None
else:
# initial value string "23515655-90555-2"
# "90555" is size in bytes "2" is pages
value = input.xpath("./@value")[0]
split_value = value.split("-")
if len(split_value) != 3:
return None
return split_value[idx]

@staticmethod
def _get_page_count_from_tr_input_value(tr):
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
count = DocketReport._get_input_value_from_tr(tr, 2)
count = get_input_value_from_tr(tr, 2, 3, "-")
if count is not None:
return int(count)

Expand Down Expand Up @@ -1238,26 +1222,14 @@ def _get_file_size_bytes_from_tr(tr):
tr, 1
)
else:
file_size_str = DocketReport._get_input_value_from_tr(tr, 1)
file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
if file_size == 0:
return None
return file_size

@staticmethod
def _get_file_size_str_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as a str.
"""
cells = tr.xpath("./td")
last_cell_contents = cells[-1].text_content()
units = ["kb", "mb"]
if any(unit in last_cell_contents.lower() for unit in units):
return last_cell_contents.strip()
return ""

def _get_pacer_doc_id(self, row):
"""Take in a row from the attachment table and return the pacer_doc_id
for the item in that row. Return None if the ID cannot be found.
Expand All @@ -1275,7 +1247,7 @@ def _get_pacer_doc_id(self, row):
if value:
pacer_doc_suffix = value[0]
else:
pacer_doc_suffix = DocketReport._get_input_value_from_tr(row, 0)
pacer_doc_suffix = get_input_value_from_tr(row, 0, 3, "-")
if pacer_doc_suffix is None:
return None
# after inserting prefixes our final doc_id is "035023515655"
Expand Down Expand Up @@ -1315,7 +1287,7 @@ def _get_attachments(self, cells):
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"file_size_str": self._get_file_size_str_from_tr(row),
"file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
Expand Down
41 changes: 41 additions & 0 deletions juriscraper/pacer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,3 +853,44 @@ def parse_sumDocSelected_from_row(
if onclick and "sumDocSelected" in onclick[0]:
return reverse_sumDocSelected_function(onclick[0])
return None


def get_input_value_from_tr(
    tr: html.HtmlElement, idx: int, expected_values: int, split_value: str
) -> Optional[str]:
    """Return one field of the delimited ``value`` attribute of a row's input.

    Only the first ``<input>`` found under the row is considered. Its
    ``value`` attribute is split on ``split_value`` and the piece at ``idx``
    is returned. Two layouts are known:

    - ``value="6828943 14732034 1 62576"``: "1" is the page count and
      "62576" is the size in bytes (four space-separated fields).
    - ``value="23515655-90555-2"``: "90555" is the size in bytes and "2" is
      the page count (three dash-separated fields).

    :param tr: An HTML row element from which the input value will be
        extracted.
    :param idx: The index of the value to extract from the split list.
    :param expected_values: The expected number of elements in the split
        value.
    :param split_value: The delimiter used to split the value string.
    :return: The extracted value at the specified index, or None if the row
        has no input, the input has no value attribute, or the value does not
        split into exactly ``expected_values`` pieces.
    """
    inputs = tr.xpath(".//input")
    if not inputs:
        return None

    raw_values = inputs[0].xpath("./@value")
    if not raw_values:
        # An <input> without a value attribute carries nothing to parse;
        # report "not found" instead of raising IndexError.
        return None

    # Keep the pieces under their own name: rebinding the ``split_value``
    # parameter would silently turn the delimiter (str) into a list.
    parts = raw_values[0].split(split_value)
    if len(parts) != expected_values:
        # Unknown layout; refuse to guess which field is which.
        return None
    return parts[idx]


def get_file_size_str_from_tr(tr: html.HtmlElement) -> str:
    """Return the human-readable file size shown in the row's last cell.

    :param tr: A row from the attachment table.
    :return: The stripped text of the last ``<td>`` when it mentions a "kb"
        or "mb" unit (case-insensitive); otherwise an empty string, including
        when the row has no cells at all.
    """
    cells = tr.xpath("./td")
    if not cells:
        # A row without cells has no size column; report "no size" rather
        # than raising IndexError on cells[-1].
        return ""

    last_cell_contents = cells[-1].text_content()
    lowered = last_cell_contents.lower()
    if any(unit in lowered for unit in ("kb", "mb")):
        return last_cell_contents.strip()
    return ""
Loading

0 comments on commit 7803d48

Please sign in to comment.