From 153a10d901a407932a0ed3be5d8488b1c0d476fa Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 7 Jun 2018 02:16:00 -0400 Subject: [PATCH 1/5] docket_report.py: flake8 for PEP8 flake8 juriscraper/pacer/docket_report.py juriscraper/pacer/docket_report.py:96:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:142:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:166:80: E501 line too long (83 > 79 characters) juriscraper/pacer/docket_report.py:168:80: E501 line too long (88 > 79 characters) juriscraper/pacer/docket_report.py:174:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:176:80: E501 line too long (95 > 79 characters) juriscraper/pacer/docket_report.py:184:80: E501 line too long (86 > 79 characters) juriscraper/pacer/docket_report.py:198:80: E501 line too long (91 > 79 characters) juriscraper/pacer/docket_report.py:201:80: E501 line too long (98 > 79 characters) juriscraper/pacer/docket_report.py:206:80: E501 line too long (96 > 79 characters) juriscraper/pacer/docket_report.py:208:80: E501 line too long (110 > 79 characters) juriscraper/pacer/docket_report.py:212:80: E501 line too long (107 > 79 characters) juriscraper/pacer/docket_report.py:319:80: E501 line too long (82 > 79 characters) juriscraper/pacer/docket_report.py:336:80: E501 line too long (87 > 79 characters) juriscraper/pacer/docket_report.py:338:80: E501 line too long (92 > 79 characters) juriscraper/pacer/docket_report.py:366:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:483:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:684:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:695:80: E501 line too long (84 > 79 characters) juriscraper/pacer/docket_report.py:718:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:724:80: E501 line too long (96 > 79 characters) 
juriscraper/pacer/docket_report.py:728:15: E131 continuation line unaligned for hanging indent juriscraper/pacer/docket_report.py:754:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:794:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:797:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:893:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:912:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:931:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:947:80: E501 line too long (80 > 79 characters) juriscraper/pacer/docket_report.py:1057:1: E305 expected 2 blank lines after class or function definition, found 1 --- juriscraper/pacer/docket_report.py | 129 +++++++++++++++++------------ 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index 345950a0b..55b978df9 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -93,8 +93,9 @@ def _get_value(self, regex, query_strings, cast_to_date=False): return convert_date_string(m.group(1)) hit = m.group(1) if "date filed" not in hit.lower(): - # Safety check. Sometimes a match is made against the merged - # text string, including its headers. This is wrong. + # Safety check. Sometimes a match is made against + # the merged text string, including its + # headers. This is wrong. return hit if cast_to_date: @@ -139,7 +140,8 @@ def get_datetime_from_tree(self, path, cast_to_date=False): logger.debug("Couldn't parse date: %s" % s) return None else: - d = d.replace(tzinfo=d.tzinfo or gettz('UTC')) # Set it to UTC. + # Set it to UTC. 
+ d = d.replace(tzinfo=d.tzinfo or gettz('UTC')) if cast_to_date is True: return d.date() return d @@ -163,17 +165,21 @@ class DocketReport(BaseDocketReport, BaseReport): case_name_str = r"(?:Case\s+title:\s+)?(.*\bv\.?\s.*)" case_name_regex = re.compile(case_name_str) case_name_i_regex = re.compile(case_name_str, flags=re.IGNORECASE) - case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)", flags=re.IGNORECASE) + case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)", + flags=re.IGNORECASE) in_re_regex = re.compile(r"(\bIN\s+RE:\s+.*)", flags=re.IGNORECASE) - in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)", flags=re.IGNORECASE) + in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)", + flags=re.IGNORECASE) case_name_regexes = [ case_name_regex, case_name_i_regex, case_title_regex, in_re_regex, in_the_matter_regex, ] date_filed_regex = re.compile(r'Date [fF]iled:\s+(%s)' % date_regex) - date_converted_regex = re.compile(r'Date [Cc]onverted:\s+(%s)' % date_regex) + date_converted_regex = re.compile( + r'Date [Cc]onverted:\s+(%s)' % date_regex) # Be careful this does not match "Joint debtor discharged" field. 
- date_discharged_regex = re.compile(r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex) + date_discharged_regex = re.compile( + r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex) assigned_to_regex = r'Assigned to:\s+(.*)' referred_to_regex = r'Referred to:\s+(.*)' cause_regex = re.compile(r'Cause:\s+(.*)') @@ -181,7 +187,8 @@ class DocketReport(BaseDocketReport, BaseReport): jury_demand_regex = re.compile(r'Jury Demand:\s+(.*)') jurisdiction_regex = re.compile(r'Jurisdiction:\s+(.*)') demand_regex = re.compile(r'^Demand:\s+(.*)') - docket_number_dist_regex = re.compile(r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})") + docket_number_dist_regex = re.compile( + r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})") docket_number_bankr_regex = re.compile(r"(?:#:\s+)?((\d-)?\d\d-\d*)") offense_regex = re.compile( r'highest\s+offense.*(?Popening|terminated)', flags=re.I) @@ -195,21 +202,26 @@ class DocketReport(BaseDocketReport, BaseReport): 'is_adversary_proceeding'] ERROR_STRINGS = BaseReport.ERROR_STRINGS + [ - "The report may take a long time to run because this case has many docket entries", + ("The report may take a long time to run because this case has " + "many docket entries"), "The page ID does not exist. Please enter a valid page ID number. ", "There are no documents in this case.", - "Incomplete request. Please try your query again by choosing the Query or Reports option", + ("Incomplete request. 
Please try your query again by choosing the " + "Query or Reports option"), "To accept charges shown below, click on the 'View Report' button", "Unable to create PDF file.", "This case was administratively closed", "The start date must be less than or equal to the end date", - "The starting document number must be less than or equal to the ending document number", + ("The starting document number must be less than or equal to " + "the ending document number"), "Case not found.", - "Either you do not have permission to view the document, or the document does not exist in the case.", + ("Either you do not have permission to view the document, " + "or the document does not exist in the case."), "Format: text", "Server timeout waiting for the HTTP request from the client.", "The case type was.*but it must be", - "This case is in the process of being opened, please check back later for additional information.", + ("This case is in the process of being opened, " + "please check back later for additional information."), "Submission already made, please wait for response from server", ] @@ -314,9 +326,12 @@ def parties(self): # document table has bold/underline/italic text. 
path = ( '//tr[' - ' ./td[1]//i/b/text() or ' # Bankruptcy - ' ./td[1]//b/u/text() or ' # Regular district - ' ./td[1]//b/text()[contains(., "-----")]' # Adversary proceedings + # Bankruptcy + ' ./td[1]//i/b/text() or ' + # Regular district + ' ./td[1]//b/u/text() or ' + # Adversary proceedings + ' ./td[1]//b/text()[contains(., "-----")]' ']/../tr' ) party_rows = self.tree.xpath(path) @@ -333,9 +348,11 @@ def parties(self): if should_continue: continue - name_path = u'.//b[not(./parent::i)][not(./u)][not(contains(., "------"))]' + name_path = (u'.//b[not(./parent::i)]' + '[not(./u)][not(contains(., "------"))]') is_party_name_cell = (len(cells[0].xpath(name_path)) > 0) - prev_has_disposition = prev is not None and 'Disposition' in prev.text_content() + prev_has_disposition = prev is not None and \ + 'Disposition' in prev.text_content() if is_party_name_cell and not prev_has_disposition: element = cells[0].xpath(name_path)[0] party[u'name'] = force_unicode(element.text_content().strip()) @@ -362,9 +379,10 @@ def parties(self): parties.append(party) if self.is_adversary_proceeding: - # In adversary proceedings, there are multiple rows under one - # party type header. Nuke the bulk of the party dict, except for - # the type so that it's ready for the next iteration. + # In adversary proceedings, there are multiple rows + # under one party type header. Nuke the bulk of the + # party dict, except for the type so that it's ready + # for the next iteration. party = {u'type': party[u'type']} else: party = {} @@ -479,9 +497,10 @@ def _add_criminal_data_to_parties(self, parties, party_rows): :param party_rows: The trs with party/criminal data :return: None """ - # Because criminal data spans multiple trs, the way we do this is by - # keeping track of which party we're currently working on. Then, when we - # get useful criminal data, we add it to that party. 
+ # Because criminal data spans multiple trs, the way we do this + # is by keeping track of which party we're currently working + # on. Then, when we get useful criminal data, we add it to + # that party. empty_criminal_data = { u'counts': [], u'complaints': [], @@ -681,7 +700,8 @@ def _get_attorneys(cell): """ attorneys = [] for atty_node in cell.xpath('.//b'): - name_parts = force_unicode(atty_node.text_content().strip()).split() + name_parts = force_unicode( + atty_node.text_content().strip()).split() attorney = { u'name': u' '.join(name_parts), u'roles': [], @@ -692,7 +712,8 @@ def _get_attorneys(cell): # noinspection PyProtectedMember if isinstance(node, (etree._ElementStringResult, etree._ElementUnicodeResult)): - clean_atty = u'%s\n' % ' '.join(n.strip() for n in node.split()) + clean_atty = u'%s\n' % ' '.join( + n.strip() for n in node.split()) if clean_atty.strip(): attorney[u'contact'] += clean_atty else: @@ -715,19 +736,21 @@ def docket_entries(self): if self._docket_entries is not None: return self._docket_entries - # There can be multiple docket entry tables on a single docket page. See - # https://github.com/freelawproject/courtlistener/issues/762. ∴ we need - # to identify the first table, and all following tables. The following - # tables lack column headers, so we have to use the preceding-sibling - # tables to make sure it's right. + # There can be multiple docket entry tables on a single docket + # page. See + # https://github.com/freelawproject/courtlistener/issues/762. + # ∴ we need to identify the first table, and all following + # tables. The following tables lack column headers, so we have + # to use the preceding-sibling tables to make sure it's right. 
docket_header = './/text()[contains(., "Docket Text")]' - bankr_multi_doc = 'not(.//text()[contains(., "Total file size of selected documents")])' + bankr_multi_doc = ('not(.//text()[contains(.,' + '"Total file size of selected documents")])') footer_multi_doc = 'not(.//text()[contains(., "Footer format:")])' docket_entry_rows = self.tree.xpath( '//table' - '[preceding-sibling::table[{dh}] or {dh}]' - '[{b_multi_doc}]' - '[{footer_multi_doc}]' + '[preceding-sibling::table[{dh}] or {dh}]' + '[{b_multi_doc}]' + '[{footer_multi_doc}]' '/tbody/tr'.format( dh=docket_header, b_multi_doc=bankr_multi_doc, @@ -750,8 +773,8 @@ def docket_entries(self): continue de[u'date_filed'] = convert_date_string(date_filed_str) de[u'document_number'] = self._get_document_number(cells[1]) - de[u'pacer_doc_id'] = self._get_pacer_doc_id(cells[1], - de[u'document_number']) + de[u'pacer_doc_id'] = self._get_pacer_doc_id( + cells[1], de[u'document_number']) de[u'description'] = self._get_description(cells) if not de[u'document_number']: # Minute order. Skip for now. @@ -791,11 +814,11 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None, were entered into PACER or the date they were filed. :param date_start: The start date for the date range (as a date object) :param date_end: The end date for the date range (as a date object) - :param doc_num_start: A range of documents can be requested. This is the - lower bound of their ID numbers. + :param doc_num_start: A range of documents can be + requested. This is the lower bound of their ID numbers. :param doc_num_end: The upper bound of the requested documents. - :param show_parties_and_counsel: Whether to show the parties and counsel - in a case (note this adds expense). + :param show_parties_and_counsel: Whether to show the parties and + counsel in a case (note this adds expense). :param show_terminated_parties: Whether to show terminated parties in a case (note this adds expense). 
:param show_list_of_member_cases: Whether to show a list of member @@ -808,6 +831,7 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None, :param order_by: The ordering desired for the results. :return: None. Instead sets self.response attribute and runs self.parse() + """ # Set up and sanity tests assert self.session is not None, \ @@ -890,8 +914,8 @@ def _set_metadata_values(self): u'/ancestor::table[not(.//center)][last()]' )[0] cells = table.xpath(u'.//td') - # Convert the
separated content into text strings, treating as much - # as possible as HTML. + # Convert the
separated content into text strings, + # treating as much as possible as HTML. values = [] for cell in cells: clean_texts = [clean_string(s) for s in self._br_split(cell)] @@ -909,8 +933,8 @@ def _get_pacer_doc_id(cell, document_number): # column in their docket report. urls = cell.xpath(u'.//a') if len(urls) == 0: - # Docket entry exists, but cannot download document (it's sealed - # or otherwise unavailable in PACER). + # Docket entry exists, but cannot download document + # (it's sealed or otherwise unavailable in PACER). return None for url in urls: if url.text_content().strip() == document_number: @@ -928,9 +952,9 @@ def _get_document_number(self, cell): if words: first_word = re.sub(u'[\s\u00A0]', '', words[0]) if self.court_id == u'txnb': - # txnb merges the second and third columns, so if the first word - # is a number, return it. Otherwise, assume doc number isn't - # listed for the item. + # txnb merges the second and third columns, so if the + # first word is a number, return it. Otherwise, assume + # doc number isn't listed for the item. if first_word.isdigit(): return first_word else: @@ -942,10 +966,10 @@ def _get_description(self, cells): return force_unicode(cells[2].text_content()) s = force_unicode(cells[1].text_content()) - # In txnb the second and third columns of the docket entries are - # combined. The field can have one of four formats. Attempt the most - # detailed first, then work our way down to just giving up and capturing - # it all. + # In txnb the second and third columns of the docket entries + # are combined. The field can have one of four + # formats. Attempt the most detailed first, then work our way + # down to just giving up and capturing it all. 
ws = u'[\s\u00A0]' # Whitespace including nbsp regexes = [ # 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and @@ -1054,6 +1078,7 @@ def _get_judge(self, regex): judge_str = judge_str.split('to:')[1] return normalize_judge_string(judge_str)[0] + if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python -m juriscraper.pacer.docket_report filepath") From 68790c9d9102ab1fc84c3a93adf5f46ed628d23e Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 7 Jun 2018 03:09:26 -0400 Subject: [PATCH 2/5] DocketReport: Abstract CELL_XPATH, adjust loop Use a constant, CELL_XPATH, for finding cells in the table Don't exclude header rows from `docket_entry_rows`. These lend clarity to the next commit. --- juriscraper/pacer/docket_report.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index 55b978df9..d60c45405 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -756,12 +756,15 @@ def docket_entries(self): b_multi_doc=bankr_multi_doc, footer_multi_doc=footer_multi_doc, ) - )[1:] # Skip the first row. + ) + CELL_XPATH = u'./td[not(./input)]' + docket_entries = [] - for row in docket_entry_rows: + # Skip the first row. + for row in docket_entry_rows[1:]: de = {} - cells = row.xpath(u'./td[not(./input)]') + cells = row.xpath(CELL_XPATH) if len(cells) == 4: # In some instances, the document entry table has an extra # column. See almb, 92-04963 From 4c848b818b000bad827e2e46bc26ed3c051ff4d2 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 7 Jun 2018 03:17:33 -0400 Subject: [PATCH 3/5] DocketReport: handle 'Date Entered' option A PACER docket report can be run with either 'Date Entered' or 'Date Filed' dates. Previous code assumed, roughly, "A date is a date," but that's not so. 
It's common for the date entered to lag the date filed by a day, and then that means different people who run the docket report with different values for that radio button would cause different dates to appear in the CL `date_filed` field. So here we check the column heading of the first column against several known choices, or we throw an exception if it doesn't make sense. We broaden the CELL_XPATH expression to include `th` as well as `td`, since in a few BK cases that's required. We also normalize the column heading whitespace with a helper function normalize_whitespace() that arguably could live elsewhere. There's some ambiguity about what 'Docket Date' means (is it filing or entering?), but since it only appears historically, we'll place it where we have always done so -- as the `date_filed`. This needs freelawproject/courtlistener#840 which adds support for the new `date_entered` field. --- juriscraper/pacer/docket_report.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index d60c45405..954bfbd57 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -758,8 +758,26 @@ def docket_entries(self): ) ) - CELL_XPATH = u'./td[not(./input)]' - + CELL_XPATH = u'(./td[not(./input)] | ./th[not(./input)])' + + if not docket_entry_rows: + return [] + + def normalize_whitespace(str): + ''' Split (on whitespace) and then rejoin with spaces. + Replaces non-terminal runs of whitespace with a single space. + ''' + return ' '.join(str.split()) + + raw_date_kind = normalize_whitespace( + docket_entry_rows[0].xpath(CELL_XPATH)[0].text_content()) + if raw_date_kind == 'Date Entered': + date_kind = u'date_entered' + elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']: + date_kind = u'date_filed' + else: + raise AssertionError('Unknown date(?) kind <%s>' % raw_date_kind) + docket_entries = [] # Skip the first row. 
for row in docket_entry_rows[1:]: @@ -770,11 +788,11 @@ def docket_entries(self): # column. See almb, 92-04963 del cells[1] - date_filed_str = force_unicode(cells[0].text_content()) - if not date_filed_str: + date_str = force_unicode(cells[0].text_content()) + if not date_str: # Some older dockets have missing dates. Press on. continue - de[u'date_filed'] = convert_date_string(date_filed_str) + de[date_kind] = convert_date_string(date_str) de[u'document_number'] = self._get_document_number(cells[1]) de[u'pacer_doc_id'] = self._get_pacer_doc_id( cells[1], de[u'document_number']) From c2b89203f7f3a2be9f5af55eb2ecf0fa8d26c98b Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Thu, 7 Jun 2018 03:35:17 -0400 Subject: [PATCH 4/5] DocketReport: tests for the 'Date Entered' report variant --- .../pacer/dockets/district/mad_134453.html | 28 ++++++++++++++ .../pacer/dockets/district/mad_134453.json | 37 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 tests/examples/pacer/dockets/district/mad_134453.html create mode 100644 tests/examples/pacer/dockets/district/mad_134453.json diff --git a/tests/examples/pacer/dockets/district/mad_134453.html b/tests/examples/pacer/dockets/district/mad_134453.html new file mode 100644 index 000000000..368d3cf26 --- /dev/null +++ b/tests/examples/pacer/dockets/district/mad_134453.html @@ -0,0 +1,28 @@ +CM/ECF - USDC Massachusetts - Version 6.1.1.2u as of 7/29/2017 +
+ + +
CLOSED,CONSOLIDATED,LEAD,STAYED
+

United States District Court
+District of Massachusetts (Boston)
+CIVIL DOCKET FOR CASE #: 1:11-cv-10230-MLW

+ + + +

Arkansas Teacher Retirement System v. State Street Corporation et al
+Assigned to: Judge Mark L. Wolf
+Demand: $5,000,000
+
related Case: 1:11-cv-12049-MLW
+Cause: 28:1332 Diversity-(Citizenship)

Date Filed: 02/10/2011
+Date Terminated: 06/23/2014
+Jury Demand: Plaintiff
+Nature of Suit: 370 Other Fraud
+Jurisdiction: Diversity
+
+ + + + + +
Date Entered#Docket Text
06/06/2018251  MOTION To seal document. by State Street Bank & Trust Company. (Attachments: # 1 Exhibit Redacted Motion)(Franklin, Yvonne) (Entered: 06/06/2018)
06/06/2018256 Judge Mark L. Wolf: "...[I]t is hereby ORDERED that Labaton shall file, by 9:00 a.m. on June 7, 2018, versions of its submissions with redactions consistent with the standards discussed in the May 16, 2018 Order in this case, see Docket No. 223, and any other jurisprudence which Labaton cites with its June 7, 2018 submissions. Any failure to submit a properly redacted version of the June 5, 2018 submissions may, among other things, result in the denial of Labaton's motion to impound." ORDER entered. Associated Cases: 1:11-cv-10230-MLW, 1:11-cv-12049-MLW, 1:12-cv-11698-MLW(Bono, Christine) (Entered: 06/06/2018)
06/06/2018258 AFFIDAVIT of George Hopkins by Arkansas Teacher Retirement System. (Kearney, Kristen) (Entered: 06/06/2018)

+
PACER Service Center
Transaction Receipt
06/06/2018 17:49:24
PACER Login: jhawkinson0:3355704:0 Client Code:
Description: Docket Report Search Criteria: 1:11-cv-10230-MLW Start date: 6/6/2018
Billable Pages: 1 Cost: 0.10
\ No newline at end of file diff --git a/tests/examples/pacer/dockets/district/mad_134453.json b/tests/examples/pacer/dockets/district/mad_134453.json new file mode 100644 index 000000000..81e617f48 --- /dev/null +++ b/tests/examples/pacer/dockets/district/mad_134453.json @@ -0,0 +1,37 @@ +{ + "assigned_to_str": "Mark L. Wolf", + "case_name": "Arkansas Teacher Retirement System v. State Street Corporation", + "cause": "28:1332 Diversity-(Citizenship)", + "court_id": "mad", + "date_converted": null, + "date_discharged": null, + "date_filed": "2011-02-10", + "date_terminated": "2014-06-23", + "demand": "$5,000,000", + "docket_entries": [ + { + "date_entered": "2018-06-06", + "description": "MOTION To seal document. by State Street Bank & Trust Company. (Attachments: # 1 Exhibit Redacted Motion)(Franklin, Yvonne) (Entered: 06/06/2018)", + "document_number": "251", + "pacer_doc_id": "09508729824" + }, + { + "date_entered": "2018-06-06", + "description": "Judge Mark L. Wolf: \"...[I]t is hereby ORDERED that Labaton shall file, by 9:00 a.m. on June 7, 2018, versions of its submissions with redactions consistent with the standards discussed in the May 16, 2018 Order in this case, see Docket No. 223, and any other jurisprudence which Labaton cites with its June 7, 2018 submissions. Any failure to submit a properly redacted version of the June 5, 2018 submissions may, among other things, result in the denial of Labaton's motion to impound.\" ORDER entered. Associated Cases: 1:11-cv-10230-MLW, 1:11-cv-12049-MLW, 1:12-cv-11698-MLW(Bono, Christine) (Entered: 06/06/2018)", + "document_number": "256", + "pacer_doc_id": "09508730306" + }, + { + "date_entered": "2018-06-06", + "description": "AFFIDAVIT of George Hopkins by Arkansas Teacher Retirement System. 
(Kearney, Kristen) (Entered: 06/06/2018)", + "document_number": "258", + "pacer_doc_id": "09508732451" + } + ], + "docket_number": "1:11-cv-10230", + "jurisdiction": "Diversity", + "jury_demand": "Plaintiff", + "nature_of_suit": "370 Other Fraud", + "parties": [], + "referred_to_str": "" +} \ No newline at end of file From 4d56072ce83db2b5479dc011676687846e13075c Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 9 Jun 2018 20:53:58 -0400 Subject: [PATCH 5/5] DocketReport: Parse description's "(Entered" Throw an assertion if it differs from the Date Entered column. But in the normal case, we'll get Date Filed from the first column and Date Entered from the end of the description's parenthetical note. --- juriscraper/pacer/docket_report.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index 954bfbd57..998736b11 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -797,6 +797,20 @@ def normalize_whitespace(str): de[u'pacer_doc_id'] = self._get_pacer_doc_id( cells[1], de[u'document_number']) de[u'description'] = self._get_description(cells) + + # If there's a '(Entered: xx/yy/zzzz)' notation at end, use it! + match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$', + de[u'description']) + if match: + date_entered = convert_date_string(match.group(1)) + if u'date_entered' in de: + assert de[u'date_entered'] == date_entered, ( + 'Date Entered column (%s) does not match parsed value ' + 'from end of description (%s)' % (de[u'date_entered'], + date_entered)) + else: + de[u'date_entered'] = date_entered + if not de[u'document_number']: # Minute order. Skip for now. continue