diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
index 345950a0b..998736b11 100644
--- a/juriscraper/pacer/docket_report.py
+++ b/juriscraper/pacer/docket_report.py
@@ -93,8 +93,9 @@ def _get_value(self, regex, query_strings, cast_to_date=False):
                     return convert_date_string(m.group(1))
                 hit = m.group(1)
                 if "date filed" not in hit.lower():
-                    # Safety check. Sometimes a match is made against the merged
-                    # text string, including its headers. This is wrong.
+                    # Safety check. Sometimes a match is made against
+                    # the merged text string, including its
+                    # headers. This is wrong.
                     return hit

         if cast_to_date:
@@ -139,7 +140,8 @@ def get_datetime_from_tree(self, path, cast_to_date=False):
             logger.debug("Couldn't parse date: %s" % s)
             return None
         else:
-            d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))  # Set it to UTC.
+            # Set it to UTC.
+            d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))
             if cast_to_date is True:
                 return d.date()
             return d
@@ -163,17 +165,21 @@ class DocketReport(BaseDocketReport, BaseReport):
     case_name_str = r"(?:Case\s+title:\s+)?(.*\bv\.?\s.*)"
     case_name_regex = re.compile(case_name_str)
     case_name_i_regex = re.compile(case_name_str, flags=re.IGNORECASE)
-    case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)", flags=re.IGNORECASE)
+    case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)",
+                                  flags=re.IGNORECASE)
     in_re_regex = re.compile(r"(\bIN\s+RE:\s+.*)", flags=re.IGNORECASE)
-    in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)", flags=re.IGNORECASE)
+    in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)",
+                                     flags=re.IGNORECASE)
     case_name_regexes = [
         case_name_regex, case_name_i_regex, case_title_regex, in_re_regex,
         in_the_matter_regex,
     ]
     date_filed_regex = re.compile(r'Date [fF]iled:\s+(%s)' % date_regex)
-    date_converted_regex = re.compile(r'Date [Cc]onverted:\s+(%s)' % date_regex)
+    date_converted_regex = re.compile(
+        r'Date [Cc]onverted:\s+(%s)' % date_regex)
     # Be careful this does not match "Joint debtor discharged" field.
-    date_discharged_regex = re.compile(r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
+    date_discharged_regex = re.compile(
+        r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
     assigned_to_regex = r'Assigned to:\s+(.*)'
     referred_to_regex = r'Referred to:\s+(.*)'
     cause_regex = re.compile(r'Cause:\s+(.*)')
@@ -181,7 +187,8 @@ class DocketReport(BaseDocketReport, BaseReport):
     jury_demand_regex = re.compile(r'Jury Demand:\s+(.*)')
     jurisdiction_regex = re.compile(r'Jurisdiction:\s+(.*)')
     demand_regex = re.compile(r'^Demand:\s+(.*)')
-    docket_number_dist_regex = re.compile(r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
+    docket_number_dist_regex = re.compile(
+        r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
     docket_number_bankr_regex = re.compile(r"(?:#:\s+)?((\d-)?\d\d-\d*)")
     offense_regex = re.compile(
         r'highest\s+offense.*(?P<status>opening|terminated)', flags=re.I)
@@ -195,21 +202,26 @@ class DocketReport(BaseDocketReport, BaseReport):
                    'is_adversary_proceeding']

     ERROR_STRINGS = BaseReport.ERROR_STRINGS + [
-        "The report may take a long time to run because this case has many docket entries",
+        ("The report may take a long time to run because this case has "
+         "many docket entries"),
         "The page ID does not exist. Please enter a valid page ID number. ",
         "There are no documents in this case.",
-        "Incomplete request. Please try your query again by choosing the Query or Reports option",
+        ("Incomplete request. Please try your query again by choosing the "
+         "Query or Reports option"),
         "To accept charges shown below, click on the 'View Report' button",
         "Unable to create PDF file.",
         "This case was administratively closed",
         "The start date must be less than or equal to the end date",
-        "The starting document number must be less than or equal to the ending document number",
+        ("The starting document number must be less than or equal to "
+         "the ending document number"),
        "Case not found.",
-        "Either you do not have permission to view the document, or the document does not exist in the case.",
+        ("Either you do not have permission to view the document, "
+         "or the document does not exist in the case."),
         "Format: text",
         "Server timeout waiting for the HTTP request from the client.",
         "The case type was.*but it must be",
-        "This case is in the process of being opened, please check back later for additional information.",
+        ("This case is in the process of being opened, "
+         "please check back later for additional information."),
         "Submission already made, please wait for response from server",
     ]

@@ -314,9 +326,12 @@ def parties(self):
         # document table has bold/underline/italic text.
         path = (
             '//tr['
-            '    ./td[1]//i/b/text() or '  # Bankruptcy
-            '    ./td[1]//b/u/text() or '  # Regular district
-            '    ./td[1]//b/text()[contains(., "-----")]'  # Adversary proceedings
+            # Bankruptcy
+            '    ./td[1]//i/b/text() or '
+            # Regular district
+            '    ./td[1]//b/u/text() or '
+            # Adversary proceedings
+            '    ./td[1]//b/text()[contains(., "-----")]'
             ']/../tr'
         )
         party_rows = self.tree.xpath(path)
@@ -333,9 +348,11 @@ def parties(self):
             if should_continue:
                 continue

-            name_path = u'.//b[not(./parent::i)][not(./u)][not(contains(., "------"))]'
+            name_path = (u'.//b[not(./parent::i)]'
+                         '[not(./u)][not(contains(., "------"))]')
             is_party_name_cell = (len(cells[0].xpath(name_path)) > 0)
-            prev_has_disposition = prev is not None and 'Disposition' in prev.text_content()
+            prev_has_disposition = prev is not None and \
+                'Disposition' in prev.text_content()
             if is_party_name_cell and not prev_has_disposition:
                 element = cells[0].xpath(name_path)[0]
                 party[u'name'] = force_unicode(element.text_content().strip())
@@ -362,9 +379,10 @@ def parties(self):
             parties.append(party)

             if self.is_adversary_proceeding:
-                # In adversary proceedings, there are multiple rows under one
-                # party type header. Nuke the bulk of the party dict, except for
-                # the type so that it's ready for the next iteration.
+                # In adversary proceedings, there are multiple rows
+                # under one party type header. Nuke the bulk of the
+                # party dict, except for the type so that it's ready
+                # for the next iteration.
                 party = {u'type': party[u'type']}
             else:
                 party = {}
@@ -479,9 +497,10 @@ def _add_criminal_data_to_parties(self, parties, party_rows):
         :param party_rows: The trs with party/criminal data
         :return: None
         """
-        # Because criminal data spans multiple trs, the way we do this is by
-        # keeping track of which party we're currently working on. Then, when we
-        # get useful criminal data, we add it to that party.
+        # Because criminal data spans multiple trs, the way we do this
+        # is by keeping track of which party we're currently working
+        # on. Then, when we get useful criminal data, we add it to
+        # that party.
         empty_criminal_data = {
             u'counts': [],
             u'complaints': [],
@@ -681,7 +700,8 @@ def _get_attorneys(cell):
         """
         attorneys = []
         for atty_node in cell.xpath('.//b'):
-            name_parts = force_unicode(atty_node.text_content().strip()).split()
+            name_parts = force_unicode(
+                atty_node.text_content().strip()).split()
             attorney = {
                 u'name': u' '.join(name_parts),
                 u'roles': [],
@@ -692,7 +712,8 @@ def _get_attorneys(cell):
                 # noinspection PyProtectedMember
                 if isinstance(node, (etree._ElementStringResult,
                                      etree._ElementUnicodeResult)):
-                    clean_atty = u'%s\n' % ' '.join(n.strip() for n in node.split())
+                    clean_atty = u'%s\n' % ' '.join(
+                        n.strip() for n in node.split())
                     if clean_atty.strip():
                         attorney[u'contact'] += clean_atty
                 else:
@@ -715,44 +736,81 @@ def docket_entries(self):
         if self._docket_entries is not None:
             return self._docket_entries

-        # There can be multiple docket entry tables on a single docket page. See
-        # https://github.com/freelawproject/courtlistener/issues/762. ∴ we need
-        # to identify the first table, and all following tables. The following
-        # tables lack column headers, so we have to use the preceding-sibling
-        # tables to make sure it's right.
+        # There can be multiple docket entry tables on a single docket
+        # page. See
+        # https://github.com/freelawproject/courtlistener/issues/762.
+        # ∴ we need to identify the first table, and all following
+        # tables. The following tables lack column headers, so we have
+        # to use the preceding-sibling tables to make sure it's right.
         docket_header = './/text()[contains(., "Docket Text")]'
-        bankr_multi_doc = 'not(.//text()[contains(., "Total file size of selected documents")])'
+        bankr_multi_doc = ('not(.//text()[contains(., '
+                           '"Total file size of selected documents")])')
         footer_multi_doc = 'not(.//text()[contains(., "Footer format:")])'
         docket_entry_rows = self.tree.xpath(
             '//table'
-            '[preceding-sibling::table[{dh}] or {dh}]'
-            '[{b_multi_doc}]'
-            '[{footer_multi_doc}]'
+            '[preceding-sibling::table[{dh}] or {dh}]'
+            '[{b_multi_doc}]'
+            '[{footer_multi_doc}]'
             '/tbody/tr'.format(
                 dh=docket_header,
                 b_multi_doc=bankr_multi_doc,
                 footer_multi_doc=footer_multi_doc,
             )
-        )[1:]  # Skip the first row.
+        )
+
+        CELL_XPATH = u'(./td[not(./input)] | ./th[not(./input)])'
+
+        if not docket_entry_rows:
+            return []
+
+        def normalize_whitespace(str):
+            ''' Split (on whitespace) and then rejoin with spaces.
+            Replaces non-terminal runs of whitespace with a single space.
+            '''
+            return ' '.join(str.split())
+
+        raw_date_kind = normalize_whitespace(
+            docket_entry_rows[0].xpath(CELL_XPATH)[0].text_content())
+        if raw_date_kind == 'Date Entered':
+            date_kind = u'date_entered'
+        elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']:
+            date_kind = u'date_filed'
+        else:
+            raise AssertionError('Unknown date(?) kind <%s>' % raw_date_kind)

         docket_entries = []
-        for row in docket_entry_rows:
+        # Skip the first row.
+        for row in docket_entry_rows[1:]:
             de = {}
-            cells = row.xpath(u'./td[not(./input)]')
+            cells = row.xpath(CELL_XPATH)
             if len(cells) == 4:
                 # In some instances, the document entry table has an extra
                 # column. See almb, 92-04963
                 del cells[1]
-            date_filed_str = force_unicode(cells[0].text_content())
-            if not date_filed_str:
+            date_str = force_unicode(cells[0].text_content())
+            if not date_str:
                 # Some older dockets have missing dates. Press on.
                 continue
-            de[u'date_filed'] = convert_date_string(date_filed_str)
+            de[date_kind] = convert_date_string(date_str)
             de[u'document_number'] = self._get_document_number(cells[1])
-            de[u'pacer_doc_id'] = self._get_pacer_doc_id(cells[1],
-                                                         de[u'document_number'])
+            de[u'pacer_doc_id'] = self._get_pacer_doc_id(
+                cells[1], de[u'document_number'])
             de[u'description'] = self._get_description(cells)
+
+            # If there's a '(Entered: xx/yy/zzzz)' notation at end, use it!
+            match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$',
+                              de[u'description'])
+            if match:
+                date_entered = convert_date_string(match.group(1))
+                if u'date_entered' in de:
+                    assert de[u'date_entered'] == date_entered, (
+                        'Date Entered column (%s) does not match parsed value '
+                        'from end of description (%s)' % (de[u'date_entered'],
+                                                          date_entered))
+                else:
+                    de[u'date_entered'] = date_entered
+
             if not de[u'document_number']:
                 # Minute order. Skip for now.
                 continue
@@ -791,11 +849,11 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
             were entered into PACER or the date they were filed.
         :param date_start: The start date for the date range (as a date object)
         :param date_end: The end date for the date range (as a date object)
-        :param doc_num_start: A range of documents can be requested. This is the
-            lower bound of their ID numbers.
+        :param doc_num_start: A range of documents can be
+            requested. This is the lower bound of their ID numbers.
         :param doc_num_end: The upper bound of the requested documents.
-        :param show_parties_and_counsel: Whether to show the parties and counsel
-            in a case (note this adds expense).
+        :param show_parties_and_counsel: Whether to show the parties and
+            counsel in a case (note this adds expense).
         :param show_terminated_parties: Whether to show terminated parties
            in a case (note this adds expense).
         :param show_list_of_member_cases: Whether to show a list of member
@@ -808,6 +866,7 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
         :param order_by: The ordering desired for the results.
         :return: None. Instead sets self.response attribute and runs
             self.parse()
+
         """
         # Set up and sanity tests
         assert self.session is not None, \
@@ -890,8 +949,8 @@ def _set_metadata_values(self):
             u'/ancestor::table[not(.//center)][last()]'
         )[0]
         cells = table.xpath(u'.//td')
-        # Convert the <br> separated content into text strings, treating as much
-        # as possible as HTML.
+        # Convert the <br> separated content into text strings,
+        # treating as much as possible as HTML.
         values = []
         for cell in cells:
             clean_texts = [clean_string(s) for s in self._br_split(cell)]
@@ -909,8 +968,8 @@ def _get_pacer_doc_id(cell, document_number):
         # column in their docket report.
         urls = cell.xpath(u'.//a')
         if len(urls) == 0:
-            # Docket entry exists, but cannot download document (it's sealed
-            # or otherwise unavailable in PACER).
+            # Docket entry exists, but cannot download document
+            # (it's sealed or otherwise unavailable in PACER).
             return None
         for url in urls:
             if url.text_content().strip() == document_number:
@@ -928,9 +987,9 @@ def _get_document_number(self, cell):
         if words:
             first_word = re.sub(u'[\s\u00A0]', '', words[0])
             if self.court_id == u'txnb':
-                # txnb merges the second and third columns, so if the first word
-                # is a number, return it. Otherwise, assume doc number isn't
-                # listed for the item.
+                # txnb merges the second and third columns, so if the
+                # first word is a number, return it. Otherwise, assume
+                # doc number isn't listed for the item.
                 if first_word.isdigit():
                     return first_word
             else:
@@ -942,10 +1001,10 @@ def _get_description(self, cells):
             return force_unicode(cells[2].text_content())

         s = force_unicode(cells[1].text_content())
-        # In txnb the second and third columns of the docket entries are
-        # combined. The field can have one of four formats. Attempt the most
-        # detailed first, then work our way down to just giving up and capturing
-        # it all.
+        # In txnb the second and third columns of the docket entries
+        # are combined. The field can have one of four
+        # formats. Attempt the most detailed first, then work our way
+        # down to just giving up and capturing it all.
         ws = u'[\s\u00A0]'  # Whitespace including nbsp
         regexes = [
             # 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and
@@ -1054,6 +1113,7 @@ def _get_judge(self, regex):
                 judge_str = judge_str.split('to:')[1]
             return normalize_judge_string(judge_str)[0]

+
 if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("Usage: python -m juriscraper.pacer.docket_report filepath")
diff --git a/tests/examples/pacer/dockets/district/mad_134453.html b/tests/examples/pacer/dockets/district/mad_134453.html
new file mode 100644
index 000000000..368d3cf26
--- /dev/null
+++ b/tests/examples/pacer/dockets/district/mad_134453.html
@@ -0,0 +1,28 @@
[28-line HTML fixture; the CM/ECF markup is not reproduced here. The page is
a "CM/ECF - USDC Massachusetts - Version 6.1.1.2u as of 7/29/2017" civil
docket report for Arkansas Teacher Retirement System v. State Street
Corporation et al, No. 1:11-cv-10230-MLW (Judge Mark L. Wolf), flagged
CLOSED, CONSOLIDATED, LEAD, STAYED, with the usual metadata block (Date
Filed: 02/10/2011, Date Terminated: 06/23/2014, Demand: $5,000,000, Jury
Demand: Plaintiff, Nature of Suit: 370 Other Fraud, Jurisdiction: Diversity,
Cause: 28:1332 Diversity-(Citizenship), related Case: 1:11-cv-12049-MLW).
Its docket table is headed "Date Entered | # | Docket Text" and contains
entries 251, 256, and 258, each dated 06/06/2018 and ending with
"(Entered: 06/06/2018)", followed by a PACER Service Center transaction
receipt.]
\ No newline at end of file
diff --git a/tests/examples/pacer/dockets/district/mad_134453.json b/tests/examples/pacer/dockets/district/mad_134453.json
new file mode 100644
index 000000000..81e617f48
--- /dev/null
+++ b/tests/examples/pacer/dockets/district/mad_134453.json
@@ -0,0 +1,37 @@
+{
+  "assigned_to_str": "Mark L. Wolf",
+  "case_name": "Arkansas Teacher Retirement System v. State Street Corporation",
+  "cause": "28:1332 Diversity-(Citizenship)",
+  "court_id": "mad",
+  "date_converted": null,
+  "date_discharged": null,
+  "date_filed": "2011-02-10",
+  "date_terminated": "2014-06-23",
+  "demand": "$5,000,000",
+  "docket_entries": [
+    {
+      "date_entered": "2018-06-06",
+      "description": "MOTION To seal document. by State Street Bank & Trust Company. (Attachments: # 1 Exhibit Redacted Motion)(Franklin, Yvonne) (Entered: 06/06/2018)",
+      "document_number": "251",
+      "pacer_doc_id": "09508729824"
+    },
+    {
+      "date_entered": "2018-06-06",
+      "description": "Judge Mark L. Wolf: \"...[I]t is hereby ORDERED that Labaton shall file, by 9:00 a.m. on June 7, 2018, versions of its submissions with redactions consistent with the standards discussed in the May 16, 2018 Order in this case, see Docket No. 223, and any other jurisprudence which Labaton cites with its June 7, 2018 submissions. Any failure to submit a properly redacted version of the June 5, 2018 submissions may, among other things, result in the denial of Labaton's motion to impound.\" ORDER entered. Associated Cases: 1:11-cv-10230-MLW, 1:11-cv-12049-MLW, 1:12-cv-11698-MLW(Bono, Christine) (Entered: 06/06/2018)",
+      "document_number": "256",
+      "pacer_doc_id": "09508730306"
+    },
+    {
+      "date_entered": "2018-06-06",
+      "description": "AFFIDAVIT of George Hopkins by Arkansas Teacher Retirement System. (Kearney, Kristen) (Entered: 06/06/2018)",
+      "document_number": "258",
+      "pacer_doc_id": "09508732451"
+    }
+  ],
+  "docket_number": "1:11-cv-10230",
+  "jurisdiction": "Diversity",
+  "jury_demand": "Plaintiff",
+  "nature_of_suit": "370 Other Fraud",
+  "parties": [],
+  "referred_to_str": ""
+}
\ No newline at end of file
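
A minimal usage sketch, not part of the patch above: it exercises the new
fixture and the date_entered handling, assuming the repository layout shown
in this diff and the existing BaseReport helpers (_parse_text and the data
property) that DocketReport inherits; any other names below are illustrative
only.

    # Hypothetical local check; run from the repository root of this diff.
    from juriscraper.pacer.docket_report import DocketReport

    report = DocketReport('mad')
    with open('tests/examples/pacer/dockets/district/mad_134453.html') as f:
        report._parse_text(f.read())  # parse saved HTML; no PACER session needed

    for de in report.data['docket_entries']:
        # A "Date Entered" docket yields date_entered rather than date_filed,
        # and the value agrees with the trailing "(Entered: mm/dd/yyyy)" note.
        print(de['document_number'], de.get('date_entered'), de.get('date_filed'))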