Skip to content

Commit

Permalink
fix(pacer.email): improve bankruptcy short description
Browse files Browse the repository at this point in the history
- Add `casb` examples
- Add examples with a "Chapter.." string for all courts except nyeb, for which I couldn't find any
- Fix bugs pointed out in code review
- Use the saved raw case numbers to clean up the subject, which simplifies the process
  • Loading branch information
grossir committed Jan 8, 2025
1 parent 57476e1 commit 42df1c4
Show file tree
Hide file tree
Showing 20 changed files with 1,155 additions and 124 deletions.
52 changes: 29 additions & 23 deletions juriscraper/pacer/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(self, court_id):
self.docket_numbers = []
self.subject = None
self.case_names = []
self.raw_docket_numbers = set()
if self.court_id.endswith("b"):
self.is_bankruptcy = True
else:
Expand Down Expand Up @@ -169,19 +170,20 @@ def _parse_docket_number(
if self._is_appellate():
path = self._sibling_path("Case Number")
case_number = self._xpath_text_0(current_node, f"{path}/a")
self.raw_docket_numbers.add(case_number)
return case_number, self._return_default_dn_components()

path = self._sibling_path("Case Number")
docket_numbers_str = current_node.xpath(f"{path}/a/text()")
self.raw_docket_numbers.update(set(docket_numbers_str))
docket_number, docket_number_components = (
self._parse_docket_number_strs(
current_node.xpath(f"{path}/a/text()")
)
self._parse_docket_number_strs(docket_numbers_str)
)
if not docket_number:
docket_numbers_str = current_node.xpath(f"{path}/p/a/text()")
self.raw_docket_numbers.update(set(docket_numbers_str))
docket_number, docket_number_components = (
self._parse_docket_number_strs(
current_node.xpath(f"{path}/p/a/text()")
)
self._parse_docket_number_strs(docket_numbers_str)
)
return docket_number, docket_number_components

Expand All @@ -199,6 +201,7 @@ def _parse_docket_number_plain(
email_body = self.tree.text_content()
regex = r"Case Number:(.*)"
docket_number = re.findall(regex, email_body)
self.raw_docket_numbers.update(set(docket_number))
return self._parse_docket_number_strs(docket_number)

def _get_date_filed(self) -> date:
Expand Down Expand Up @@ -396,7 +399,6 @@ def _get_dockets(self) -> DocketType:
docket_number, docket_number_components = (
self._parse_docket_number_plain()
)
docket_number = docket_number
# Cache the docket number for its later use.
self.docket_numbers.append(docket_number)

Expand Down Expand Up @@ -551,12 +553,17 @@ def _parse_bankruptcy_short_description(self, subject: str) -> str:
:return: The parsed short description.
"""
# Some courts have subjects like
# `Multiple Cases "{docket} {case name} Close Adversary Case"`
if "close adversary case" in subject.lower():
return "Close Adversary Case"
# `Multiple Cases "{docket} {case name} Close [Aa]dversary [Cc]ase" - {initials}`
if close_adv_match := re.search(
r"close adversary case", subject, flags=re.IGNORECASE
):
return close_adv_match.group(0)

short_description = ""
for part in self.docket_numbers + self.case_names:

raw_docket_numbers = [i.strip() for i in self.raw_docket_numbers]
raw_docket_numbers.sort(key=len, reverse=True) # use longest first
for part in raw_docket_numbers + self.docket_numbers + self.case_names:
subject = subject.replace(part, " ")

# Sometimes the full case name is not used in the `subject`
Expand All @@ -567,20 +574,19 @@ def _parse_bankruptcy_short_description(self, subject: str) -> str:
subject = subject.replace(case_name.split(" and ")[0], " ")

# Deletes:
# - extra docket number 'components', such as `federal_dn_judge_initials_assigned`
# these are usually 3 letters long. However, we want to keep some special acronyms
# such as MOR (Merchant of Record?)
# - "NEF: " placeholder
component_regex = r"((?!-MOR)(\-[A-Z]{2,}))|(\-[a-z]{2,})|(NEF:? )"
if self.court_id in ["paeb", "pamb", "casb"]:
# keeps the "Chapter ..." description on the short description
chapter_regex = r"(C[Hh][- ]?(13|7|9|11))|(C[hH][\s-]*$)"
else:
chapter_regex = r"(C[Hh](apter)?[- ]?(13|7|9|11))|(C[hH][\s-]*$)"

cleanup_regex = rf"{component_regex}|{chapter_regex}"
# - "Ch \d{1,2}" abbreviations
cleanup_regex = r"(NEF:? )|(C[Hh][- ]?(7|9|11|13))|(C[hH][\s-]*$)"
subject = re.sub(cleanup_regex, " ", subject)
subject = subject.strip(" -;:, ")

# Courts like `nhb` do not use the "Ch \d{1,2}" abbreviation
# and we must delete the "Chapter..." string; but only once
# See nhb_2 for an example with 2 "Chapter..." strings
if self.court_id in ["nhb"]:
chapter_regex = r"Chapter[- ]?(7|9|11|13)"
subject = re.sub(chapter_regex, " ", subject, 1)

subject = subject.strip(" ;:,- ")
# some courts use "Re: {case name}"
short_description = re.sub("( Re$)|(^Re:? )", "", subject)

Expand Down
43 changes: 29 additions & 14 deletions tests/examples/pacer/nef/s3/cacb_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,43 @@
"court_id": "cacb",
"dockets": [
{
"case_name": "Maximo Arturo Arriola",
"case_name": "Lee v. Newman",
"date_filed": null,
"docket_entries": [
{
"date_filed": "2022-10-04",
"description": "Request for courtesy Notice of Electronic Filing (NEF) Filed by Glowin, Nichole. (Glowin, Nichole)",
"document_number": "11",
"document_url": "https://ecf.cacb.uscourts.gov/doc1/9730105870952?pdf_header=&magic_num=25971254&de_seq_num=39&caseid=1952165",
"pacer_case_id": "1952165",
"pacer_doc_id": "9730105870952",
"pacer_magic_num": "25971254",
"pacer_seq_no": "39",
"short_description": "Request for courtesy Notice of Electronic Filing (NEF)"
"date_filed": "2023-02-15",
"description": "Order granting stipulation to continue pre-trial conference from 2/28/23 to 4/25/23 @ 10:00 a.m. (BNC-PDF) (Related Doc [75]) Signed on 2/15/2023 (JC6)",
"document_number": "76",
"document_url": "https://ecf.cacb.uscourts.gov/doc1/9730106502427?pdf_header=&magic_num=94356268&de_seq_num=252&caseid=1927388",
"pacer_case_id": "1927388",
"pacer_doc_id": "9730106502427",
"pacer_magic_num": "94356268",
"pacer_seq_no": "252",
"short_description": "ORDER to continue/reschedule hearing (BNC-PDF)"
}
],
"docket_number": "6:22-bk-13643",
"docket_number": "6:21-ap-01071",
"federal_defendant_number": null,
"federal_dn_case_type": "bk",
"federal_dn_judge_initials_assigned": "SY",
"federal_dn_case_type": "ap",
"federal_dn_judge_initials_assigned": "SC",
"federal_dn_judge_initials_referred": null,
"federal_dn_office_code": "6"
}
],
"email_recipients": []
"email_recipients": [
{
"email_addresses": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]@y.net",
"[email protected]",
"[email protected]",
"[email protected]"
],
"name": ""
}
]
}
82 changes: 48 additions & 34 deletions tests/examples/pacer/nef/s3/cacb_3.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
Return-Path: <court_test@cacb.uscourts.gov>
Return-Path: <cmecfhelpdesk@cacb.uscourts.gov>
Received: from icmecf201.gtwy.uscourts.gov (icmecf201.gtwy.uscourts.gov [63.241.40.204])
by inbound-smtp.us-west-2.amazonaws.com with SMTP id 4qi4f6vf121uih2gdltqsq64ld5fbnn4j72lcq01
for [email protected];
Tue, 04 Oct 2022 21:40:29 +0000 (UTC)
by inbound-smtp.us-west-2.amazonaws.com with SMTP id f086thkp632ni7ho26h2cp3m57475aoo531l0rg1
for [email protected];
Wed, 15 Feb 2023 19:05:58 +0000 (UTC)
X-SES-Spam-Verdict: PASS
X-SES-Virus-Verdict: PASS
Received-SPF: pass (spfCheck: domain of cacb.uscourts.gov designates 63.241.40.204 as permitted sender) client-ip=63.241.40.204; envelope-from=court_test@cacb.uscourts.gov; helo=icmecf201.gtwy.uscourts.gov;
Received-SPF: pass (spfCheck: domain of cacb.uscourts.gov designates 63.241.40.204 as permitted sender) client-ip=63.241.40.204; envelope-from=cmecfhelpdesk@cacb.uscourts.gov; helo=icmecf201.gtwy.uscourts.gov;
Authentication-Results: amazonses.com;
spf=pass (spfCheck: domain of cacb.uscourts.gov designates 63.241.40.204 as permitted sender) client-ip=63.241.40.204; envelope-from=court_test@cacb.uscourts.gov; helo=icmecf201.gtwy.uscourts.gov;
spf=pass (spfCheck: domain of cacb.uscourts.gov designates 63.241.40.204 as permitted sender) client-ip=63.241.40.204; envelope-from=cmecfhelpdesk@cacb.uscourts.gov; helo=icmecf201.gtwy.uscourts.gov;
dmarc=none header.from=cacb.uscourts.gov;
X-SES-RECEIPT: AEFBQUFBQUFBQUFGbGl2NXRlSU1oOHN2UEI5VlY3aks0aVRIeFVwWUVpQW9uaTloT2Z5YUE0QlV6RzA2Skdrc3JnRVpISXRtTWJBZUQ5TytFRk1zS1BQWlkzWWp2dS9OamQyK3pLWGhDaGFOUXB6MG45YmZ0L3ZJcGdWK3V0Slg3dUdBVXZWOCtwblRMSHhyNmdXamJZY09hd2JCYThFRVEzemQzZHZ6TVNvWTRxOGlTYnpoK2xtTDc3SGV4bzRMcEF6aFZCTUYyRzdwOXExK3ozWFFsdkg0WDZpTTZ6QUY0NGVKK2tjN1BEajQweG1nait0VEM4TUVJblBwV1ZZUjR0K2JuRy9TdlNjbFdhMi9McXBIcFdSdHl0TVl0M3V6WjlVb3gzRkhrQS8wQStLUTlUYWZySGo5Q0tTL3J2cExkeFpuQkhWSDZEaEk9
X-SES-DKIM-SIGNATURE: a=rsa-sha256; q=dns/txt; b=kMfg+anNhSY8jonv/o7qiHzGOmonSKX5pNuKtJST5NivHvYVtM/Jt+5sKDciDbbjeEicJ/drZYdEvc2h4od1dTT8B+BeFdzEJW+sOZxWtsbk06xO6PSLfJEFo2i6nPQIzswzkS8VEmwo6VROc79uKqgWpXAbAk/gYdYQMpGsiKo=; c=relaxed/simple; s=gdwg2y3kokkkj5a55z2ilkup5wp5hhxx; d=amazonses.com; t=1664919629; v=1; bh=OmOKVyWfAuxyGyhfWgehPtPWdAl9RFb3IbTsQgePhG4=; h=From:To:Cc:Bcc:Subject:Date:Message-ID:MIME-Version:Content-Type:X-SES-RECEIPT;
X-SES-RECEIPT: AEFBQUFBQUFBQUFGSkRsMElEeHNURW9JdS9OTUpBaDY1WGJYdHRUR3dpd3RCeElONSs1NEhUeUJUejZCMkJ5dFFxVThqNmNTS280SVJhNTEveXp2dkU2SkZ3NUc4SXk5T0k1SUhqWG5lUFdVZ2hWdHlNb1luWlhWWHlRdXdQQzBsQnV6SmhwOVEySVA2ajBCakpKbGgrQk94RDR0RG5Samh5WjhXNTdFWk1ZcnJRV05HVC8zOTdzQW1CdDRLdFdzbDQvSGJLMHdnekljZTBjVGtLdkJLRWtBZ0xHa0ZVeDZFcU9IT0FpZ3NxZjNnVlFVa3Azdk8yWU1Idk5yN2JSTzNpcW1IQzU4VjdrcDdCRlVURWkvSFBMam9QNmw4a0dmK0l1aUhlVFZTZUlsM3lid2F3Q0tpSnp1dUNlVjhVWVo2ZTRGSG51Smtjblk9
X-SES-DKIM-SIGNATURE: a=rsa-sha256; q=dns/txt; b=cyXVrqqoX2hgFA2XPCej1hlEGbluwv2Ox2WNe6t35+t9tfhd9Ph8dOMETcdyzatjzDvONU8enJROV8YWdh8AWOIOMs35snGKrgE667Vv6gc5SrAfeEnlcUpZV0yEsPSgSMLdMF7E3V3lPRExu50gRwqhA3v7RGX1A05XGwkbm0s=; c=relaxed/simple; s=gdwg2y3kokkkj5a55z2ilkup5wp5hhxx; d=amazonses.com; t=1676487959; v=1; bh=Tw3rhcsR47lIUsxr8IkHEZyLE/vCL1ppyfg937qipHc=; h=From:To:Cc:Bcc:Subject:Date:Message-ID:MIME-Version:Content-Type:X-SES-RECEIPT;
X-SBRS: None
X-REMOTE-IP: 156.119.56.188
Received: from cacbdb.cacb.gtwy.dcn ([156.119.56.188])
by icmecf201.gtwy.uscourts.gov with ESMTP; 04 Oct 2022 17:40:28 -0400
by icmecf201.gtwy.uscourts.gov with ESMTP; 15 Feb 2023 14:05:58 -0500
Received: from cacbdb.cacb.gtwy.dcn (localhost.localdomain [127.0.0.1])
by cacbdb.cacb.gtwy.dcn (8.14.4/8.14.4) with ESMTP id 294LeMLp117731;
Tue, 4 Oct 2022 14:40:23 -0700
by cacbdb.cacb.gtwy.dcn (8.14.4/8.14.4) with ESMTP id 31FJ5BJO093254;
Wed, 15 Feb 2023 11:05:13 -0800
Received: (from ecf_web@localhost)
by cacbdb.cacb.gtwy.dcn (8.14.4/8.14.4/Submit) id 294Le2sC117113;
Tue, 4 Oct 2022 14:40:02 -0700
Date: Tue, 4 Oct 2022 14:40:02 -0700
X-Authentication-Warning: cacbdb.cacb.gtwy.dcn: ecf_web set sender to court_test@cacb.uscourts.gov using -f
by cacbdb.cacb.gtwy.dcn (8.14.4/8.14.4/Submit) id 31FJ59NW093057;
Wed, 15 Feb 2023 11:05:09 -0800
Date: Wed, 15 Feb 2023 11:05:09 -0800
X-Authentication-Warning: cacbdb.cacb.gtwy.dcn: ecf_web set sender to cmecfhelpdesk@cacb.uscourts.gov using -f
MIME-Version:1.0
From:court_test@cacb.uscourts.gov
From:cmecfhelpdesk@cacb.uscourts.gov
To:[email protected]
Message-Id:<103122209@cacb.uscourts.gov>
Subject:6:22-bk-13643-SY Request for courtesy Notice of Electronic Filing (NEF)
Message-Id:<103732686@cacb.uscourts.gov>
Subject:6:21-ap-01071-SC ORDER to continue/reschedule hearing (BNC-PDF)
Content-Type: text/html

<p><strong>***NOTE TO PUBLIC ACCESS USERS*** Judicial Conference of the United States policy permits attorneys of record and parties in a case (including pro se litigants) to receive one free electronic copy of all documents filed electronically, if receipt is required by law or directed by the filer. PACER access fees apply to all other users. To avoid later charges, download a copy of each document during this first viewing. However, if the referenced document is a transcript, the free copy and 30-page limit do not apply.</strong></p>
Expand All @@ -41,20 +41,20 @@ Content-Type: text/html
Notice of Electronic Filing
<BR>
<div>
<BR>The following transaction was received from Nichole Glowin entered on 10/4/2022 at 2:40 PM PDT and filed on 10/4/2022
<BR>The following transaction was received from AutoDocket, CIAO-User entered on 02/15/2023 at 10:37 AM PST and filed on 02/15/2023

<BR>



<table border=0 cellspacing=0>
<tr><td><strong>Case Name:</strong>
</td><td>Maximo Arturo Arriola </td></tr>
<tr><td><strong>Case Number:</strong></td><td><A HREF=https://ecf.cacb.uscourts.gov/cgi-bin/DktRpt.pl?1952165>6:22-bk-13643-SY</A></td></tr>
</td><td>Lee v. Newman </td></tr>
<tr><td><strong>Case Number:</strong></td><td><A HREF=https://ecf.cacb.uscourts.gov/cgi-bin/DktRpt.pl?1927388>6:21-ap-01071-SC</A></td></tr>

<tr><td><strong>Document Number:</strong></td>
<td>
<a href='https://ecf.cacb.uscourts.gov/doc1/9730105870952?pdf_header=&magic_num=25971254&de_seq_num=39&caseid=1952165'>11</a>
<a href='https://ecf.cacb.uscourts.gov/doc1/9730106502427?pdf_header=&magic_num=94356268&de_seq_num=252&caseid=1927388'>76</a>
</td></tr>
</table>

Expand All @@ -64,18 +64,15 @@ Notice of Electronic Filing
<p><strong>Docket Text:</strong>

<BR>
Request for courtesy Notice of Electronic Filing (NEF) <i></i> Filed by Glowin, Nichole. (Glowin, Nichole)
Order granting stipulation to continue pre-trial conference from 2/28/23 to 4/25/23 @ 10:00 a.m. (BNC-PDF) (Related Doc # [75]) Signed on 2/15/2023 (JC6)
</p>

<p>The following document(s) are associated with this transaction:</p>
<table>
<STRONG>Document description:</STRONG>Main Document
<BR><STRONG>Original filename:</STRONG>NEF FOR FILING - ARRIOLA.pdf
<BR><STRONG>Electronic document
Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1106918562 [Date=10/4/2022] [FileNumber=103122207
<BR><TAB>-0] [0a8098691c134d64ab67f9c1a54cd3bca4bed423244f4f0790269da84d36ec4f8
<BR><TAB>7997cb6a58536e8961bfcec236aefc460e38fcd4479c11ae2b06762b7541417]]
<STRONG>Document description:</STRONG>Main Document
<BR><STRONG>Original filename:</STRONG>/data/docs0/ECF/ADI/cacb_live/realtime/documents/cdcbbab3-e2cd-4625-b06f-53522d662225.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR>[STAMP bkecfStamp_ID=1106918562 [Date=02/15/2023] [FileNumber=103732406-0] [a400daefa90430e46c061a209095155700b50778d5e6b77aaeb77e450c8a44bf3521e6258f250debf76dca79a24273baeb803ee02251a15bfee9f21993921b8a]]
<BR>

</table>
Expand All @@ -85,17 +82,34 @@ Request for courtesy Notice of Electronic Filing (NEF) <i></i> Filed by Glowin,


<BR><B>
6:22-bk-13643-SY Notice will be electronically mailed to:
6:21-ap-01071-SC Notice will be electronically mailed to:
</B>

<BR>

<BR><span class="personName">Suzanne C Grandt on behalf of Interested Party The State Bar of California</span>
<br>[email protected], [email protected]
<BR>
<BR><span class="personName">Howard B Grobstein (TR)</span>
<br>[email protected], [email protected]
<BR>
<BR><span class="personName">Firstname LastName on behalf of Plaintiff Chloe Taekyeong Lee</span>
<br>[email protected], [email protected],[email protected]
<BR>
<BR><span class="personName">Donald W Reid on behalf of Defendant Michael Paul Newman</span>
<br>[email protected], [email protected]
<BR>
<BR><span class="personName"> United States Trustee (RS)</span>
<br>[email protected]
<BR>

<BR>
<B>
6:22-bk-13643-SY Notice will not be electronically mailed to:
6:21-ap-01071-SC Notice will not be electronically mailed to:
</B>

<BR>

<BR>
<BR>

<BR>
Expand All @@ -104,4 +118,4 @@ Request for courtesy Notice of Electronic Filing (NEF) <i></i> Filed by Glowin,
</B>

<BR>
<BR>
<BR>
2 changes: 1 addition & 1 deletion tests/examples/pacer/nef/s3/casb_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"pacer_doc_id": null,
"pacer_magic_num": null,
"pacer_seq_no": null,
"short_description": "7 Notice of Chapter 7 Bankruptcy Case & Meeting of Creditors"
"short_description": "Notice of Chapter 7 Bankruptcy Case & Meeting of Creditors"
}
],
"docket_number": "24-04888",
Expand Down
2 changes: 1 addition & 1 deletion tests/examples/pacer/nef/s3/casb_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"pacer_doc_id": "036021419934",
"pacer_magic_num": "29072456",
"pacer_seq_no": "2",
"short_description": "7 Chapter 7 Voluntary Petition"
"short_description": "Chapter 7 Voluntary Petition"
}
],
"docket_number": "24-04888",
Expand Down
32 changes: 22 additions & 10 deletions tests/examples/pacer/nef/s3/cob.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"appellate": false,
"contains_attachments": true,
"contains_attachments": false,
"court_id": "cob",
"dockets": [
{
"case_name": "Delta, LLC",
"date_filed": null,
"docket_entries": [
{
"date_filed": "2023-09-25",
"description": "Motion to Dismiss Case For Other Reasons the debtor entity does not have a need for bankruptcy protection Filed by FilerFirstname FilerLastName on behalf of Delta, LLC. (Attachments: (1) Proposed/Unsigned Order) (FilerLastName, FilerFirstname)",
"document_number": "27",
"document_url": "https://ecf.cob.uscourts.gov/doc1/038040602882?pdf_header=&magic_num=16522018&de_seq_num=95&caseid=516007",
"date_filed": "2023-11-06",
"description": "ORDER ACCEPTING TRUSTEE'S REPORT AND CLOSING CASE. It appearing to the Court that the Trustee has filed a report certifying that the estate in the above-captioned case has been fully administered and that no objections to the report have been filed within 30 days thereafter, it is ORDERED that pursuant to Rule 5009, Federal Rule of Bankruptcy Procedure, there is a presumption that the estate has been fully administered; that the trustee be and hereby is discharged; that all nonexempt property listed by the Debtor and not administered by the trustee is hereby deemed abandoned pursuant to 11 U.S.C. 554(c); and the case hereby is closed pursuant to 11 U.S.C. 350(a). IT IS FURTHER ORDERED that all pending motions requiring notice and a hearing for which no certificate has been tendered in accordance with L.B.R. 9013-1 are deemed abandoned for want of prosecution and denied without prejudice.. (dd)",
"document_number": "33",
"document_url": null,
"pacer_case_id": "516007",
"pacer_doc_id": "038040602882",
"pacer_magic_num": "16522018",
"pacer_seq_no": "95",
"short_description": "Motion to Dismiss Case"
"pacer_doc_id": null,
"pacer_magic_num": null,
"pacer_seq_no": null,
"short_description": "Close Bankruptcy Case"
}
],
"docket_number": "23-14130",
Expand All @@ -27,5 +27,17 @@
"federal_dn_office_code": null
}
],
"email_recipients": []
"email_recipients": [
{
"email_addresses": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]"
],
"name": ""
}
]
}
Loading

0 comments on commit 42df1c4

Please sign in to comment.