DocketReport: handle 'Date Entered' option #227

Open · wants to merge 5 commits into main
juriscraper/pacer/docket_report.py (176 changes: 118 additions & 58 deletions)
@@ -93,8 +93,9 @@ def _get_value(self, regex, query_strings, cast_to_date=False):
return convert_date_string(m.group(1))
hit = m.group(1)
if "date filed" not in hit.lower():
# Safety check. Sometimes a match is made against the merged
# text string, including its headers. This is wrong.
# Safety check. Sometimes a match is made against
# the merged text string, including its
# headers. This is wrong.
return hit

if cast_to_date:
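A quick, hypothetical illustration of the failure mode this safety check guards against: when cell texts get merged, a greedy field regex can swallow the next label, and the "date filed" test rejects that hit (cause_regex is the class attribute shown further down; the merged string is made up):

import re

cause_regex = re.compile(r'Cause:\s+(.*)')
# Hypothetical merged metadata string where the next label got appended.
merged = 'Cause: 11 U.S.C. sec. 101 Date Filed: 05/05/2017'
hit = cause_regex.search(merged).group(1)
print(hit)                          # '11 U.S.C. sec. 101 Date Filed: 05/05/2017'
print('date filed' in hit.lower())  # True, so the hit is discarded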
@@ -139,7 +140,8 @@ def get_datetime_from_tree(self, path, cast_to_date=False):
logger.debug("Couldn't parse date: %s" % s)
return None
else:
d = d.replace(tzinfo=d.tzinfo or gettz('UTC')) # Set it to UTC.
# Set it to UTC.
d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))
if cast_to_date is True:
return d.date()
return d
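For reference, a minimal sketch of the UTC fallback above using python-dateutil (which the surrounding code already relies on for gettz); the input string is made up:

from dateutil import parser
from dateutil.tz import gettz

d = parser.parse('May 5, 2017 10:15 AM')        # naive; d.tzinfo is None
d = d.replace(tzinfo=d.tzinfo or gettz('UTC'))  # now timezone-aware in UTC
print(d.isoformat())                            # 2017-05-05T10:15:00+00:00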
@@ -163,25 +165,30 @@ class DocketReport(BaseDocketReport, BaseReport):
case_name_str = r"(?:Case\s+title:\s+)?(.*\bv\.?\s.*)"
case_name_regex = re.compile(case_name_str)
case_name_i_regex = re.compile(case_name_str, flags=re.IGNORECASE)
case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)", flags=re.IGNORECASE)
case_title_regex = re.compile(r"(?:Case\s+title:\s+)(.*)",
flags=re.IGNORECASE)
in_re_regex = re.compile(r"(\bIN\s+RE:\s+.*)", flags=re.IGNORECASE)
in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)", flags=re.IGNORECASE)
in_the_matter_regex = re.compile(r"(\bIn\s+the\s+matter\s+.*)",
flags=re.IGNORECASE)
case_name_regexes = [
case_name_regex, case_name_i_regex, case_title_regex, in_re_regex,
in_the_matter_regex,
]
date_filed_regex = re.compile(r'Date [fF]iled:\s+(%s)' % date_regex)
date_converted_regex = re.compile(r'Date [Cc]onverted:\s+(%s)' % date_regex)
date_converted_regex = re.compile(
r'Date [Cc]onverted:\s+(%s)' % date_regex)
# Be careful this does not match "Joint debtor discharged" field.
date_discharged_regex = re.compile(r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
date_discharged_regex = re.compile(
r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)
assigned_to_regex = r'Assigned to:\s+(.*)'
referred_to_regex = r'Referred to:\s+(.*)'
cause_regex = re.compile(r'Cause:\s+(.*)')
nos_regex = re.compile(r'Nature of Suit:\s+(.*)')
jury_demand_regex = re.compile(r'Jury Demand:\s+(.*)')
jurisdiction_regex = re.compile(r'Jurisdiction:\s+(.*)')
demand_regex = re.compile(r'^Demand:\s+(.*)')
docket_number_dist_regex = re.compile(r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
docket_number_dist_regex = re.compile(
r"((\d{1,2}:)?\d\d-[a-zA-Z]{1,4}-\d{1,10})")
docket_number_bankr_regex = re.compile(r"(?:#:\s+)?((\d-)?\d\d-\d*)")
offense_regex = re.compile(
r'highest\s+offense.*(?P<status>opening|terminated)', flags=re.I)
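To make the "Joint debtor discharged" caveat concrete, a small check (date_regex lives on the base class and is not shown here, so a simplified stand-in is used; this also assumes the joint-debtor label is rendered with a lowercase "debtor"):

import re

date_regex = r'\d{2}/\d{2}/\d{4}'  # stand-in for the base class pattern
date_discharged_regex = re.compile(
    r'(?:Date|Debtor)\s+[Dd]ischarged:\s+(%s)' % date_regex)

print(bool(date_discharged_regex.search('Debtor discharged: 05/05/2017')))        # True
print(bool(date_discharged_regex.search('Joint debtor discharged: 05/05/2017')))  # False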
@@ -195,21 +202,26 @@ class DocketReport(BaseDocketReport, BaseReport):
'is_adversary_proceeding']

ERROR_STRINGS = BaseReport.ERROR_STRINGS + [
"The report may take a long time to run because this case has many docket entries",
("The report may take a long time to run because this case has "
"many docket entries"),
"The page ID does not exist. Please enter a valid page ID number. ",
"There are no documents in this case.",
"Incomplete request. Please try your query again by choosing the Query or Reports option",
("Incomplete request. Please try your query again by choosing the "
"Query or Reports option"),
"To accept charges shown below, click on the 'View Report' button",
"Unable to create PDF file.",
"This case was administratively closed",
"The start date must be less than or equal to the end date",
"The starting document number must be less than or equal to the ending document number",
("The starting document number must be less than or equal to "
"the ending document number"),
"Case not found.",
"Either you do not have permission to view the document, or the document does not exist in the case.",
("Either you do not have permission to view the document, "
"or the document does not exist in the case."),
"Format: text",
"Server timeout waiting for the HTTP request from the client.",
"The case type was.*but it must be",
"This case is in the process of being opened, please check back later for additional information.",
("This case is in the process of being opened, "
"please check back later for additional information."),
"Submission already made, please wait for response from server",
]

@@ -314,9 +326,12 @@ def parties(self):
# document table has bold/underline/italic text.
path = (
'//tr['
' ./td[1]//i/b/text() or ' # Bankruptcy
' ./td[1]//b/u/text() or ' # Regular district
' ./td[1]//b/text()[contains(., "-----")]' # Adversary proceedings
# Bankruptcy
' ./td[1]//i/b/text() or '
# Regular district
' ./td[1]//b/u/text() or '
# Adversary proceedings
' ./td[1]//b/text()[contains(., "-----")]'
']/../tr'
)
party_rows = self.tree.xpath(path)
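Roughly what the union predicate above selects, shown on a toy fragment (not real PACER markup; lxml's HTML parser is assumed):

from lxml import html

# Toy party table: the party-type header cell uses <i><b>, as bankruptcy
# dockets do.
tree = html.fromstring(
    '<table>'
    '<tr><td><i><b>Debtor</b></i></td></tr>'
    '<tr><td><b>John Doe</b></td></tr>'
    '</table>')
rows = tree.xpath(
    '//tr['
    ' ./td[1]//i/b/text() or '
    ' ./td[1]//b/u/text() or '
    ' ./td[1]//b/text()[contains(., "-----")]'
    ']/../tr')
print(len(rows))  # 2 -- every row of any table that has a matching header row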
@@ -333,9 +348,11 @@ def parties(self):
if should_continue:
continue

name_path = u'.//b[not(./parent::i)][not(./u)][not(contains(., "------"))]'
name_path = (u'.//b[not(./parent::i)]'
'[not(./u)][not(contains(., "------"))]')
is_party_name_cell = (len(cells[0].xpath(name_path)) > 0)
prev_has_disposition = prev is not None and 'Disposition' in prev.text_content()
prev_has_disposition = prev is not None and \
'Disposition' in prev.text_content()
if is_party_name_cell and not prev_has_disposition:
element = cells[0].xpath(name_path)[0]
party[u'name'] = force_unicode(element.text_content().strip())
@@ -362,9 +379,10 @@ def parties(self):
parties.append(party)

if self.is_adversary_proceeding:
# In adversary proceedings, there are multiple rows under one
# party type header. Nuke the bulk of the party dict, except for
# the type so that it's ready for the next iteration.
# In adversary proceedings, there are multiple rows
# under one party type header. Nuke the bulk of the
# party dict, except for the type so that it's ready
# for the next iteration.
party = {u'type': party[u'type']}
else:
party = {}
@@ -479,9 +497,10 @@ def _add_criminal_data_to_parties(self, parties, party_rows):
:param party_rows: The trs with party/criminal data
:return: None
"""
# Because criminal data spans multiple trs, the way we do this is by
# keeping track of which party we're currently working on. Then, when we
# get useful criminal data, we add it to that party.
# Because criminal data spans multiple trs, the way we do this
# is by keeping track of which party we're currently working
# on. Then, when we get useful criminal data, we add it to
# that party.
empty_criminal_data = {
u'counts': [],
u'complaints': [],
@@ -681,7 +700,8 @@ def _get_attorneys(cell):
"""
attorneys = []
for atty_node in cell.xpath('.//b'):
name_parts = force_unicode(atty_node.text_content().strip()).split()
name_parts = force_unicode(
atty_node.text_content().strip()).split()
attorney = {
u'name': u' '.join(name_parts),
u'roles': [],
@@ -692,7 +712,8 @@ def _get_attorneys(cell):
# noinspection PyProtectedMember
if isinstance(node, (etree._ElementStringResult,
etree._ElementUnicodeResult)):
clean_atty = u'%s\n' % ' '.join(n.strip() for n in node.split())
clean_atty = u'%s\n' % ' '.join(
n.strip() for n in node.split())
if clean_atty.strip():
attorney[u'contact'] += clean_atty
else:
@@ -715,44 +736,81 @@ def docket_entries(self):
if self._docket_entries is not None:
return self._docket_entries

# There can be multiple docket entry tables on a single docket page. See
# https://github.com/freelawproject/courtlistener/issues/762. ∴ we need
# to identify the first table, and all following tables. The following
# tables lack column headers, so we have to use the preceding-sibling
# tables to make sure it's right.
# There can be multiple docket entry tables on a single docket
# page. See
# https://github.com/freelawproject/courtlistener/issues/762.
# ∴ we need to identify the first table, and all following
# tables. The following tables lack column headers, so we have
# to use the preceding-sibling tables to make sure it's right.
docket_header = './/text()[contains(., "Docket Text")]'
bankr_multi_doc = 'not(.//text()[contains(., "Total file size of selected documents")])'
bankr_multi_doc = ('not(.//text()[contains(.,'
'"Total file size of selected documents")])')
footer_multi_doc = 'not(.//text()[contains(., "Footer format:")])'
docket_entry_rows = self.tree.xpath(
'//table'
'[preceding-sibling::table[{dh}] or {dh}]'
'[{b_multi_doc}]'
'[{footer_multi_doc}]'
'[preceding-sibling::table[{dh}] or {dh}]'
'[{b_multi_doc}]'
'[{footer_multi_doc}]'
'/tbody/tr'.format(
dh=docket_header,
b_multi_doc=bankr_multi_doc,
footer_multi_doc=footer_multi_doc,
)
)[1:] # Skip the first row.
)

CELL_XPATH = u'(./td[not(./input)] | ./th[not(./input)])'

if not docket_entry_rows:
return []

def normalize_whitespace(str):
''' Split (on whitespace) and then rejoin with spaces.
Replaces non-terminal runs of whitespace with a single space.
'''
return ' '.join(str.split())

raw_date_kind = normalize_whitespace(
docket_entry_rows[0].xpath(CELL_XPATH)[0].text_content())
if raw_date_kind == 'Date Entered':
date_kind = u'date_entered'
elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']:
date_kind = u'date_filed'
else:
raise AssertionError('Unknown date(?) kind <%s>' % raw_date_kind)

docket_entries = []
for row in docket_entry_rows:
# Skip the first row.
for row in docket_entry_rows[1:]:
de = {}
cells = row.xpath(u'./td[not(./input)]')
cells = row.xpath(CELL_XPATH)
if len(cells) == 4:
# In some instances, the document entry table has an extra
# column. See almb, 92-04963
del cells[1]

date_filed_str = force_unicode(cells[0].text_content())
if not date_filed_str:
date_str = force_unicode(cells[0].text_content())
if not date_str:
# Some older dockets have missing dates. Press on.
continue
de[u'date_filed'] = convert_date_string(date_filed_str)
de[date_kind] = convert_date_string(date_str)
de[u'document_number'] = self._get_document_number(cells[1])
de[u'pacer_doc_id'] = self._get_pacer_doc_id(cells[1],
de[u'document_number'])
de[u'pacer_doc_id'] = self._get_pacer_doc_id(
cells[1], de[u'document_number'])
de[u'description'] = self._get_description(cells)

# If there's a '(Entered: xx/yy/zzzz)' notation at end, use it!
match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$',
de[u'description'])
if match:
date_entered = convert_date_string(match.group(1))
if u'date_entered' in de:
assert de[u'date_entered'] == date_entered, (
'Date Entered column (%s) does not match parsed value '
'from end of description (%s)' % (de[u'date_entered'],
date_entered))
else:
de[u'date_entered'] = date_entered

if not de[u'document_number']:
# Minute order. Skip for now.
continue
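Taken together, the new header check and the existing "(Entered: ...)" suffix give two independent sources for date_entered, which the assert above cross-checks. A self-contained sketch of both pieces with made-up sample strings (convert_date_string comes from juriscraper's helpers and is omitted here):

import re

def normalize_whitespace(s):
    return ' '.join(s.split())

# 1. Decide what the first column holds from its header cell text.
raw_date_kind = normalize_whitespace('Date\n        Entered')
if raw_date_kind == 'Date Entered':
    date_kind = 'date_entered'
elif raw_date_kind in ['Date Filed', 'Filing Date', 'Docket Date']:
    date_kind = 'date_filed'
print(date_kind)  # date_entered

# 2. Independently, pull a trailing "(Entered: mm/dd/yyyy)" note out of the
#    docket text so it can be compared against the column value.
description = 'ORDER granting motion to continue. (Entered: 05/07/2017)'
match = re.search(r'\(Entered: (\d{2}/\d{2}/\d{4})\)$', description)
if match:
    print(match.group(1))  # 05/07/2017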
@@ -791,11 +849,11 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
were entered into PACER or the date they were filed.
:param date_start: The start date for the date range (as a date object)
:param date_end: The end date for the date range (as a date object)
:param doc_num_start: A range of documents can be requested. This is the
lower bound of their ID numbers.
:param doc_num_start: A range of documents can be
requested. This is the lower bound of their ID numbers.
:param doc_num_end: The upper bound of the requested documents.
:param show_parties_and_counsel: Whether to show the parties and counsel
in a case (note this adds expense).
:param show_parties_and_counsel: Whether to show the parties and
counsel in a case (note this adds expense).
:param show_terminated_parties: Whether to show terminated parties in a
case (note this adds expense).
:param show_list_of_member_cases: Whether to show a list of member
@@ -808,6 +866,7 @@ def query(self, pacer_case_id, date_range_type='Filed', date_start=None,
:param order_by: The ordering desired for the results.
:return: None. Instead sets self.response attribute and runs
self.parse()

"""
# Set up and sanity tests
assert self.session is not None, \
@@ -890,8 +949,8 @@ def _set_metadata_values(self):
u'/ancestor::table[not(.//center)][last()]'
)[0]
cells = table.xpath(u'.//td')
# Convert the <br> separated content into text strings, treating as much
# as possible as HTML.
# Convert the <br> separated content into text strings,
# treating as much as possible as HTML.
values = []
for cell in cells:
clean_texts = [clean_string(s) for s in self._br_split(cell)]
@@ -909,8 +968,8 @@ def _get_pacer_doc_id(cell, document_number):
# column in their docket report.
urls = cell.xpath(u'.//a')
if len(urls) == 0:
# Docket entry exists, but cannot download document (it's sealed
# or otherwise unavailable in PACER).
# Docket entry exists, but cannot download document
# (it's sealed or otherwise unavailable in PACER).
return None
for url in urls:
if url.text_content().strip() == document_number:
@@ -928,9 +987,9 @@ def _get_document_number(self, cell):
if words:
first_word = re.sub(u'[\s\u00A0]', '', words[0])
if self.court_id == u'txnb':
# txnb merges the second and third columns, so if the first word
# is a number, return it. Otherwise, assume doc number isn't
# listed for the item.
# txnb merges the second and third columns, so if the
# first word is a number, return it. Otherwise, assume
# doc number isn't listed for the item.
if first_word.isdigit():
return first_word
else:
@@ -942,10 +1001,10 @@ def _get_description(self, cells):
return force_unicode(cells[2].text_content())

s = force_unicode(cells[1].text_content())
# In txnb the second and third columns of the docket entries are
# combined. The field can have one of four formats. Attempt the most
# detailed first, then work our way down to just giving up and capturing
# it all.
# In txnb the second and third columns of the docket entries
# are combined. The field can have one of four
# formats. Attempt the most detailed first, then work our way
# down to just giving up and capturing it all.
ws = u'[\s\u00A0]' # Whitespace including nbsp
regexes = [
# 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and
@@ -1054,6 +1113,7 @@ def _get_judge(self, regex):
judge_str = judge_str.split('to:')[1]
return normalize_judge_string(judge_str)[0]


if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python -m juriscraper.pacer.docket_report filepath")