Skip to content

Commit

Permalink
Merge pull request #1255 from freelawproject/fix_pa
Browse files Browse the repository at this point in the history
feat(pa): collect neutral citations and regional citations; also paginate results
  • Loading branch information
flooie authored Jan 8, 2025
2 parents 94b4047 + d878c45 commit ea57762
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 7 deletions.
22 changes: 18 additions & 4 deletions juriscraper/opinions/united_states/state/pa.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,20 @@ class Site(OpinionSiteLinear):
court = "Supreme"
base_url = "https://www.pacourts.us/api/opinion?"
document_url = "https://www.pacourts.us/assets/opinions/{}/out/{}"
days_interval = 20
days_interval = 1
api_dt_format = "%Y-%m-%dT00:00:00-05:00"
first_opinion_date = datetime(1998, 4, 27)
judge_key = "AuthorCode"
regional_cite_regex = re.compile(r"\d{1,3} A\.3d \d+")

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.regex = re.compile(r"(.*)(?:[,-]?\s+Nos?\.)(.*)")
self.status = "Published"

now = datetime.now() + timedelta(days=1)
start = now - timedelta(days=7)
now = datetime.now()
start = now - timedelta(days=1)
self.params = {
"startDate": start.strftime(self.api_dt_format),
"endDate": now.strftime(self.api_dt_format),
Expand All @@ -50,9 +51,13 @@ def _process_html(self) -> None:
json_response = self.html

for cluster in json_response["Items"]:
title = cluster["Caption"]
disposition_date = cluster["DispositionDate"].split("T")[0]
title = cluster["Caption"]
name, docket = self.parse_case_title(title)
# A.3d cites seem to exist only for pasuperct
cite = ""
if cite_match := self.regional_cite_regex.search(title):
cite = cite_match.group(0)

for op in cluster["Postings"]:
per_curiam = False
Expand All @@ -75,9 +80,18 @@ def _process_html(self) -> None:
"judge": author_str,
"status": status,
"per_curiam": per_curiam,
"citation": cite,
}
)

if not self.test_mode_enabled() and json_response.get("HasNext"):
next_page = json_response["PageNumber"] + 1
logger.info("Paginating to page %s", next_page)
self.params["pageNumber"] = next_page
self.url = f"{self.base_url}{urlencode(self.params)}"
self.html = self._download()
self._process_html()

def parse_case_title(self, title: str) -> Tuple[str, str]:
"""Separates case_name and docket_number from case string
Expand Down
6 changes: 5 additions & 1 deletion juriscraper/opinions/united_states/state/pacommwct.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@
from urllib.parse import urlencode

from juriscraper.opinions.united_states.state import pasuperct
from juriscraper.OpinionSite import OpinionSite


class Site(pasuperct.Site):
court = "Commonwealth"
days_interval = 30
first_opinion_date = datetime(1998, 8, 17)
# Deactivate extract_from_text from parent class
# and avoid triggering the example requirement from
# tests.local.test_ScraperExtractFromTextTest
extract_from_text = OpinionSite.extract_from_text

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down
16 changes: 14 additions & 2 deletions juriscraper/opinions/united_states/state/pasuperct.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@

import re
from datetime import datetime
from typing import Dict
from typing import Dict, Optional
from urllib.parse import urlencode

from juriscraper.opinions.united_states.state import pa


class Site(pa.Site):
court = "Superior"
days_interval = 20
first_opinion_date = datetime(1998, 2, 15)
judge_key = "AuthorName"

Expand Down Expand Up @@ -61,3 +60,16 @@ def clean_judge(self, author_str: str) -> str:
" by ", " "
)
return author_str

def extract_from_text(self, scraped_text: str) -> Optional[Dict]:
    """Get the neutral citation from the beginning of the opinion text

    Only the first 200 characters of `scraped_text` are searched, looking
    for a citation shaped like "2025 PA Super 5". Not all scraped
    opinions have one.

    :param scraped_text: text extracted from the scraped opinion
    :return: `{"Citation": {...}}` holding the `volume`, `reporter` and
        `page` strings captured by the pattern plus `"type": 8`;
        `None` when no neutral citation is found
    """
    neutral_citation_regex = (
        r"(?P<volume>\d{4}) (?P<reporter>PA Super) (?P<page>\d+)"
    )
    cite_match = re.search(neutral_citation_regex, scraped_text[:200])
    if cite_match is None:
        # Explicit return: every exit path of a value-returning function
        # should state what it returns instead of falling off the end
        return None

    cite_data = cite_match.groupdict()
    cite_data["type"] = 8  # Neutral citation
    return {"Citation": cite_data}
4 changes: 4 additions & 0 deletions tests/examples/opinions/united_states/pa_example.compare.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Dougherty, Kevin M.",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -20,6 +21,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Brobson, P. Kevin",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -32,6 +34,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "108 MAP 2023",
"judges": "Wecht, David N.",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -44,6 +47,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "212 WAL 2024",
"judges": "",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "526 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -20,6 +21,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "168 M.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -32,6 +34,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "337 M.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -44,6 +47,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "379 M.D. 2024",
"judges": "Leadbetter",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -56,6 +60,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "374 C.D. 2023",
"judges": "Leavitt",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -68,6 +73,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "716 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "UCBR",
"per_curiam": false
},
Expand All @@ -80,6 +86,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "651 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "S.E.N.",
"per_curiam": false
},
Expand All @@ -92,6 +99,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "469 C.D. 2023",
"judges": "Dumas",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -104,6 +112,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "804 C.D. 2023",
"judges": "Wojcik",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -116,6 +125,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "229 C.D. 2022",
"judges": "Ceisler",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -128,6 +138,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "824 C.D. 2023",
"judges": "Leavitt",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -140,6 +151,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "264 & 1012 C.D. 2022",
"judges": "Leadbetter",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -152,6 +164,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "390 C.D. 2023",
"judges": "McCullough",
"citations": "",
"case_name_shorts": "DHS",
"per_curiam": false
},
Expand All @@ -164,6 +177,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "506 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -176,6 +190,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "791 C.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "PPB",
"per_curiam": false
},
Expand All @@ -188,6 +203,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "164 M.D. 2023",
"judges": "Covey",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -200,6 +216,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "44 C.D. 2023",
"judges": "McCullough. Dumas",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -212,6 +229,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "35, 371 & 388 C.D. 2023",
"judges": "Wallace",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -224,6 +242,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "369 M.D. 2023",
"judges": "Ceisler",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
},
Expand All @@ -236,6 +255,7 @@
"date_filed_is_approximate": false,
"docket_numbers": "33 M.D. 2024",
"judges": "Cohn Jubelirer. McCullough",
"citations": "",
"case_name_shorts": "",
"per_curiam": false
}
Expand Down
Loading

0 comments on commit ea57762

Please sign in to comment.