DocketReport: handle 'Date Entered' option #227

Open · wants to merge 5 commits into main
juriscraper/pacer/docket_report.py (176 changes: 118 additions & 58 deletions)
@@ -93,8 +93,9 @@ def _get_value(self, regex, query_strings, cast_to_date=False):
return convert_date_string(m.group(1))
hit = m.group(1)
if "date filed" not in hit.lower():
# Safety check. Sometimes a match is made against the merged
# text string, including its headers. This is wrong.
# Safety check. Sometimes a match is made against
# the merged text string, including its
# headers. This is wrong.
return hit

if cast_to_date:
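A quick, hypothetical illustration of the failure mode this safety check guards against: when cell texts get merged, a greedy field regex can swallow the next label, and the "date filed" test rejects that hit (cause_regex is the class attribute shown further down; the merged string is made up):

import re

cause_regex = re.compile(r'Cause:\s+(.*)')
# Hypothetical merged metadata string where the next label got appended.
merged = 'Cause: 11 U.S.C. sec. 101 Date Filed: 05/05/2017'
hit = cause_regex.search(merged).group(1)
print(hit)                          # '11 U.S.C. sec. 101 Date Filed: 05/05/2017'
print('date filed' in hit.lower())  # True, so the hit is discarded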
@@ -139,7 +140,8 @@ def get_datetime_from_tree(self, path, cast_to_date=False):
logger.debug("Couldn't parse date: %s" % s)
return None
else:
d = d.replace(tzinfo=d.tzinfo or gettz('UTC')) # Set it to UTC.
# Set it to UTC.
d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))
if cast_to_date is True:
return d.date()
return d
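For reference, a minimal sketch of the UTC fallback above using python-dateutil (which the surrounding code already relies on for gettz); the input string is made up:

from dateutil import parser
from dateutil.tz import gettz

d = parser.parse('May 5, 2017 10:15 AM')        # naive; d.tzinfo is None
d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))  # now timezone-aware in UTC
print(d.isoformat())                            # 2017-05-05T10:15:00+00:00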
@@ -163,25 +165,30 @@ class DocketReport(BaseDocketReport, BaseReport):
case_name_str = r"(?:Case\s+title:\s+)?(.*\bv\.?\s.*)"
case_name_regex = re.compile(case_name_str)
case_name_i_regex = re.compile(case_name_str, flags=re.IGNORECASE)
case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)", flags=re.IGNORECASE)
case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)",
flags=re.IGNORECASE)
in_re_regex = re.compile(r"(\bIN\s+RE:\s+.*)", flags=re.IGNORECASE)
in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)", flags=re.IGNORECASE)
in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)",
flags=re.IGNORECASE)
case_name_regexes = [
case_name_regex, case_name_i_regex, case_title_regex, in_re_regex,
in_the_matter_regex,
]
date_filed_regex = re.compile(r'Date [fF]iled:\s+(%s)' % date_regex)
date_converted_regex = re.compile(r'Date [Cc]onverted:\s+(%s)' % date_regex)
date_converted_regex = re.compile(
r'Date [Cc]onverted:\s+(%s)' % date_regex)
# Be careful this does not match "Joint debtor discharged" field.
date_discharged_regex = re.compile(r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
date_discharged_regex = re.compile(
r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
assigned_to_regex = r'Assigned to:\s+(.*)'
referred_to_regex = r'Referred to:\s+(.*)'
cause_regex = re.compile(r'Cause:\s+(.*)')
nos_regex = re.compile(r'Nature of Suit:\s+(.*)')
jury_demand_regex = re.compile(r'Jury Demand:\s+(.*)')
jurisdiction_regex = re.compile(r'Jurisdiction:\s+(.*)')
demand_regex = re.compile(r'^Demand:\s+(.*)')
docket_number_dist_regex = re.compile(r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
docket_number_dist_regex = re.compile(
r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
docket_number_bankr_regex = re.compile(r"(?:#:\s+)?((\d-)?\d\d-\d*)")
offense_regex = re.compile(
r'highest\s+offense.*(?P<status>opening|terminated)', flags=re.I)
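To make the "Joint debtor discharged" caveat concrete, a small check (date_regex lives on the base class and is not shown here, so a simplified stand-in is used; this also assumes the joint-debtor label is rendered with a lowercase "debtor"):

import re

date_regex = r'\d{2}/\d{2}/\d{4}'  # stand-in for the base class pattern
date_discharged_regex = re.compile(
    r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)

print(bool(date_discharged_regex.search('Debtor discharged: 05/05/2017')))        # True
print(bool(date_discharged_regex.search('Joint debtor discharged: 05/05/2017')))  # False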
@@ -195,21 +202,26 @@ class DocketReport(BaseDocketReport, BaseReport):
'is_adversary_proceeding']

ERROR_STRINGS = BaseReport.ERROR_STRINGS + [
"The report may take a long time to run because this case has many docket entries",
("The report may take a long time to run because this case has "
"many docket entries"),
"The page ID does not exist. Please enter a valid page ID number. ",
"There are no documents in this case.",
"Incomplete request. Please try your query again by choosing the Query or Reports option",
("Incomplete request. Please try your query again by choosing the "
"Query or Reports option"),
"To accept charges shown below, click on the 'View Report' button",
"Unable to create PDF file.",
"This case was administratively closed",
"The start date must be less than or equal to the end date",
"The starting document number must be less than or equal to the ending document number",
("The starting document number must be less than or equal to "
"the ending document number"),
"Case not found.",
"Either you do not have permission to view the document, or the document does not exist in the case.",
("Either you do not have permission to view the document, "
"or the document does not exist in the case."),
"Format: text",
"Server timeout waiting for the HTTP request from the client.",
"The case type was.*but it must be",
"This case is in the process of being opened, please check back later for additional information.",
("This case is in the process of being opened, "
"please check back later for additional information."),
"Submission already made, please wait for response from server",
]

@@ -314,9 +326,12 @@ def parties(self):
# document table has bold/underline/italic text.
path = (
'//tr['
' ./td[1]//i/b/text() or ' # Bankruptcy
' ./td[1]//b/u/text() or ' # Regular district
' ./td[1]//b/text()[contains(., "-----")]' # Adversary proceedings
# Bankruptcy
' ./td[1]//i/b/text() or '
# Regular district
' ./td[1]//b/u/text() or '
# Adversary proceedings
' ./td[1]//b/text()[contains(., "-----")]'
']/../tr'
)
party_rows = self.tree.xpath(path)
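Roughly what the union predicate above selects, shown on a toy fragment (not real PACER markup; lxml's HTML parser is assumed):

from lxml import html

# Toy party table: the party-type header cell uses <i><b>, as bankruptcy
# dockets do.
tree = html.fromstring(
    '<table>'
    '<tr><td><i><b>Debtor</b></i></td></tr>'
    '<tr><td><b>John Doe</b></td></tr>'
    '</table>')
rows = tree.xpath(
    '//tr['
    ' ./td[1]//i/b/text() or '
    ' ./td[1]//b/u/text() or '
    ' ./td[1]//b/text()[contains(., "-----")]'
    ']/../tr')
print(len(rows))  # 2 -- every row of any table that has a matching header row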
@@ -333,9 +348,11 @@ def parties(self):
if should_continue:
continue

name_path = u'.//b[not(./parent::i)][not(./u)][not(contains(., "------"))]'
name_path = (u'.//b[not(./parent::i)]'
'[not(./u)][not(contains(., "------"))]')
is_party_name_cell = (len(cells[0].xpath(name_path)) > 0)
prev_has_disposition = prev is not None and 'Disposition' in prev.text_content()
prev_has_disposition = prev is not None and \
'Disposition' in prev.text_content()
if is_party_name_cell and not prev_has_disposition:
element = cells[0].xpath(name_path)[0]
party[u'name'] = force_unicode(element.text_content().strip())
@@ -362,9 +379,10 @@ def parties(self):
parties.append(party)

if self.is_adversary_proceeding:
# In adversary proceedings, there are multiple rows under one
# party type header. Nuke the bulk of the party dict, except for
# the type so that it's ready for the next iteration.
# In adversary proceedings, there are multiple rows
# under one party type header. Nuke the bulk of the
# party dict, except for the type so that it's ready
# for the next iteration.
party = {u'type': party[u'type']}
else:
party = {}
@@ -479,9 +497,10 @@ def _add_criminal_data_to_parties(self, parties, party_rows):
:param party_rows: The trs with party/criminal data
:return: None
"""
# Because criminal data spans multiple trs, the way we do this is by
# keeping track of which party we're currently working on. Then, when we
# get useful criminal data, we add it to that party.
# Because criminal data spans multiple trs, the way we do this
# is by keeping track of which party we're currently working
# on. Then, when we get useful criminal data, we add it to
# that party.
empty_criminal_data = {
u'counts': [],
u'complaints': [],
@@ -681,7 +700,8 @@ def _get_attorneys(cell):
"""
attorneys = []
for atty_node in cell.xpath('.//b'):
name_parts = force_unicode(atty_node.text_content().strip()).split()
name_parts = force_unicode(
atty_node.text_content().strip()).split()
attorney = {
u'name': u' '.join(name_parts),
u'roles': [],
@@ -692,7 +712,8 @@ def _get_attorneys(cell):
# noinspection PyProtectedMember
if isinstance(node, (etree._ElementStringResult,
etree._ElementUnicodeResult)):
clean_atty = u'%s\n' % ' '.join(n.strip() for n in node.split())
clean_atty = u'%s\n' % ' '.join(
n.strip() for n in node.split())
if clean_atty.strip():
attorney[u'contact'] += clean_atty
else:
@@ -715,44 +736,81 @@ def docket_entries(self):
if self._docket_entries is not None:
return self._docket_entries

# There can be multiple docket entry tables on a single docket page. See
# https://github.com/freelawproject/courtlistener/issues/762. ∴ we need
# to identify the first table, and all following tables. The following
# tables lack column headers, so we have to use the preceding-sibling
# tables to make sure it's right.
# There can be multiple docket entry tables on a single docket
# page. See
# https://github.com/freelawproject/courtlistener/issues/762.
# ∴ we need to identify the first table, and all following
# tables. The following tables lack column headers, so we have
# to use the preceding-sibling tables to make sure it's right.
docket_header = './/text()[contains(., "Docket Text")]'
bankr_multi_doc = 'not(.//text()[contains(., "Total file size of selected documents")])'
bankr_multi_doc = ('not(.//text()[contains(.,'
'"Total file size of selected documents")])')
footer_multi_doc = 'not(.//text()[contains(., "Footer format:")])'
docket_entry_rows = self.tree.xpath(
'//table'
'[preceding-sibling::table[{dh}] or {dh}]'
'[{b_multi_doc}]'
'[{footer_multi_doc}]'
'[preceding-sibling::table[{dh}] or {dh}]'
'[{b_multi_doc}]'
'[{footer_multi_doc}]'
'/tbody/tr'.format(
dh=docket_header,
b_multi_doc=bankr_multi_doc,
footer_multi_doc=footer_multi_doc,
)
)[1:] # Skip the first row.
)

CELL_XPATH = u'(./td[not(./input)] | ./th[not(./input)])'

if not docket_entry_rows:
return []

def normalize_whitespace(str):
''' Split (on whitespace) and then rejoin with spaces.
Replaces non-terminal runs of whitespace with a single space.
'''
return ' '.join(str.split())

raw_date_kind = normalize_whitespace(
docket_entry_rows[0].xpath(CELL_XPATH)[0].text_content())
if raw_date_kind == 'Date Entered':
date_kind = u'date_entered'
elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']:
date_kind = u'date_filed'
else:
raise AssertionError('Unknown date(?) kind <%s>' % raw_date_kind)

docket_entries = []
for row in docket_entry_rows:
# Skip the first row.
for row in docket_entry_rows[1:]:
de = {}
cells = row.xpath(u'./td[not(./input)]')
cells = row.xpath(CELL_XPATH)
if len(cells) == 4:
# In some instances, the document entry table has an extra
# column. See almb, 92-04963
del cells[1]

date_filed_str = force_unicode(cells[0].text_content())
if not date_filed_str:
date_str = force_unicode(cells[0].text_content())
if not date_str:
# Some older dockets have missing dates. Press on.
continue
de[u'date_filed'] = convert_date_string(date_filed_str)
de[date_kind] = convert_date_string(date_str)
de[u'document_number'] = self._get_document_number(cells[1])
de[u'pacer_doc_id'] = self._get_pacer_doc_id(cells[1],
de[u'document_number'])
de[u'pacer_doc_id'] = self._get_pacer_doc_id(
cells[1], de[u'document_number'])
de[u'description'] = self._get_description(cells)

# If there's a '(Entered: xx/yy/zzzz)' notation at end, use it!
match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$',
de[u'description'])
if match:
date_entered = convert_date_string(match.group(1))
if u'date_entered' in de:
assert de[u'date_entered'] == date_entered, (
'Date Entered column (%s) does not match parsed value '
'from end of description (%s)' % (de[u'date_entered'],
date_entered))
else:
de[u'date_entered'] = date_entered

if not de[u'document_number']:
# Minute order. Skip for now.
continue
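Taken together, the new header check and the existing "(Entered: ...)" suffix give two independent sources for date_entered, which the assert above cross-checks. A self-contained sketch of both pieces with made-up sample strings (convert_date_string comes from juriscraper's helpers and is omitted here):

import re

def normalize_whitespace(s):
    return ' '.join(s.split())

# 1. Decide what the first column holds from its header cell text.
raw_date_kind = normalize_whitespace('Date\n        Entered')
if raw_date_kind == 'Date Entered':
    date_kind = 'date_entered'
elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']:
    date_kind = 'date_filed'
print(date_kind)  # date_entered

# 2. Independently, pull a trailing "(Entered: mm/dd/yyyy)" note out of the
#    docket text so it can be compared against the column value.
description = 'ORDER granting motion to continue. (Entered: 05/07/2017)'
match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$', description)
if match:
    print(match.group(1))  # 05/07/2017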
@@ -791,11 +849,11 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
were entered into PACER or the date they were filed.
:param date_start: The start date for the date range (as a date object)
:param date_end: The end date for the date range (as a date object)
:param doc_num_start: A range of documents can be requested. This is the
lower bound of their ID numbers.
:param doc_num_start: A range of documents can be
requested. This is the lower bound of their ID numbers.
:param doc_num_end: The upper bound of the requested documents.
:param show_parties_and_counsel: Whether to show the parties and counsel
in a case (note this adds expense).
:param show_parties_and_counsel: Whether to show the parties and
counsel in a case (note this adds expense).
:param show_terminated_parties: Whether to show terminated parties in a
case (note this adds expense).
:param show_list_of_member_cases: Whether to show a list of member
@@ -808,6 +866,7 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
:param order_by: The ordering desired for the results.
:return: None. Instead sets self.response attribute and runs
self.parse()

"""
# Set up and sanity tests
assert self.session is not None, \
@@ -890,8 +949,8 @@ def _set_metadata_values(self):
u'/ancestor::table[not(.//center)][last()]'
)[0]
cells = table.xpath(u'.//td')
# Convert the <br> separated content into text strings, treating as much
# as possible as HTML.
# Convert the <br> separated content into text strings,
# treating as much as possible as HTML.
values = []
for cell in cells:
clean_texts = [clean_string(s) for s in self._br_split(cell)]
@@ -909,8 +968,8 @@ def _get_pacer_doc_id(cell, document_number):
# column in their docket report.
urls = cell.xpath(u'.//a')
if len(urls) == 0:
# Docket entry exists, but cannot download document (it's sealed
# or otherwise unavailable in PACER).
# Docket entry exists, but cannot download document
# (it's sealed or otherwise unavailable in PACER).
return None
for url in urls:
if url.text_content().strip() == document_number:
@@ -928,9 +987,9 @@ def _get_document_number(self, cell):
if words:
first_word = re.sub(u'[\s\u00A0]', '', words[0])
if self.court_id == u'txnb':
# txnb merges the second and third columns, so if the first word
# is a number, return it. Otherwise, assume doc number isn't
# listed for the item.
# txnb merges the second and third columns, so if the
# first word is a number, return it. Otherwise, assume
# doc number isn't listed for the item.
if first_word.isdigit():
return first_word
else:
@@ -942,10 +1001,10 @@ def _get_description(self, cells):
return force_unicode(cells[2].text_content())

s = force_unicode(cells[1].text_content())
# In txnb the second and third columns of the docket entries are
# combined. The field can have one of four formats. Attempt the most
# detailed first, then work our way down to just giving up and capturing
# it all.
# In txnb the second and third columns of the docket entries
# are combined. The field can have one of four
# formats. Attempt the most detailed first, then work our way
# down to just giving up and capturing it all.
ws = u'[\s\u00A0]' # Whitespace including nbsp
regexes = [
# 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and
@@ -1054,6 +1113,7 @@ def _get_judge(self, regex):
judge_str = judge_str.split('to:')[1]
return normalize_judge_string(judge_str)[0]


if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python -m juriscraper.pacer.docket_report filepath")