Fix remaining scrapers #763

Merged · 18 commits · Nov 17, 2023
99 changes: 16 additions & 83 deletions juriscraper/opinions/united_states/administrative_agency/asbca.py
@@ -8,97 +8,30 @@
2016-03-17: Website and phone are dead. Scraper disabled in __init__.py.
"""

import re
from datetime import datetime

from juriscraper.lib.string_utils import clean_if_py3, convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = (
"http://www.asbca.mil/Decisions/decisions%d.html"
% datetime.today().year
)
self.columns = None
self.back_scrape_iterable = list(range(2013, 2000 - 1, -1))

# Fix broken month names and funky whitespace usage.
def _clean_text(self, text):
text = super()._clean_text(text)
text = text.replace(" ", " ").replace(" ", " ")
text = text.replace("Januray", "January")
text = text.replace("Februrary", "February")
text = re.sub(re.compile(r"[\s]+", flags=re.MULTILINE), " ", text)
return text

def parse_column_names(self):
# Lookup column names and save them for later
self.columns = dict()
path = "//table/tr[1]/td"
i = 1
for column in self.html.xpath(path):
colname = clean_if_py3("".join(column.itertext())).strip()
self.columns[colname] = i
i += 1
return self.columns

def _get_case_dates(self):
self.parse_column_names()
path = "//table/tr[td/a]/td[%d]/text()" % (
self.columns["Decision Date"]
)
return [
self._get_date_object_from_string(date_string)
for date_string in self.html.xpath(path)
]

def _get_date_object_from_string(self, date_string):
date_string = (
clean_if_py3(date_string)
.strip()
.replace(" ,", ", ")
.replace("2104", "2014")
)
return convert_date_string(date_string)

def _get_case_names(self):
path = "//table/tr/td/a[1]"
case_names = [
clean_if_py3("".join(txt.itertext()).strip())
for txt in self.html.xpath(path)
]
return case_names

def _get_download_urls(self):
path = "//table/tr/td/a[1]/@href"
return [clean_if_py3(href).strip() for href in self.html.xpath(path)]

def _get_judges(self):
path = "//table/tr[td/a]/td[%d]/text()" % (self.columns["Judge"],)
return [clean_if_py3(txt).strip() for txt in self.html.xpath(path)]

def _get_docket_numbers(self):
if "ASBCA Number" not in self.columns:
return None
path = "//table/tr[td/a]/td[%d]/text()" % self.columns["ASBCA Number"]
return [
f"ASBCA No. {clean_if_py3(txt).strip()}"
for txt in self.html.xpath(path)
]

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_dates)

def _download_backwards(self, year):
self.url = "http://www.asbca.mil/Decisions/decisions%d.html" % year
if year == 2010:
self.url = "http://www.asbca.mil/Decisions/decisions.html"
self.html = self._download()

def _get_case_name_shorts(self):
# We don't (yet) support short case names for administrative bodies.
return None
self.status = "Published"

def _process_html(self):
for row in self.html.xpath(".//tr")[1:]:
col1, col2, col3, col4 = row.xpath(".//td")
date = col1.text_content().strip()
if not col2.text_content().strip():
continue
name = col3.text_content().strip()
docket = col2.text_content().strip()
url = col3.xpath(".//a/@href")[0]
self.cases.append(
{"date": date, "name": name, "url": url, "docket": docket}
)
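
A minimal, self-contained sketch of the row-parsing approach used by the new _process_html above; the sample markup and its column order (date, docket, linked case name, judge) are assumptions for illustration, not the live ASBCA page:

from lxml import html

# Hypothetical markup mirroring the four-column decisions table assumed above.
SAMPLE = """
<table>
  <tr><td>Decision Date</td><td>ASBCA Number</td><td>Case Name</td><td>Judge</td></tr>
  <tr><td>November 1, 2023</td><td>63000</td><td><a href="/files/63000.pdf">Acme Corp.</a></td><td>Smith</td></tr>
</table>
"""

cases = []
for row in html.fromstring(SAMPLE).xpath(".//tr")[1:]:  # skip the header row
    col1, col2, col3, col4 = row.xpath(".//td")
    if not col2.text_content().strip():  # rows without a docket are spacers
        continue
    cases.append(
        {
            "date": col1.text_content().strip(),
            "docket": col2.text_content().strip(),
            "name": col3.text_content().strip(),
            "url": col3.xpath(".//a/@href")[0],
        }
    )
print(cases)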
102 changes: 29 additions & 73 deletions juriscraper/opinions/united_states/administrative_agency/bia.py
@@ -19,61 +19,12 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = "https://www.justice.gov/eoir/ag-bia-decisions"
self.article = None
self.volume = 0
self.urls = None

def _process_elements(self, elements) -> Dict[str, Any]:
"""Process the element grouping.

There is no easy way to parse out the content. Unfortunately, the DOJ
admins randomly nest some elements and not others. The only consistency
is that the content is always split between HR tags. So we iterate
over the elements in order until we find an HR tag and then process
the content. Rinse, wash, repeat.

Additionally, the only date we have is the year of the decision.

:param elements: The elements between <hr> tags.
:return: Case data
"""
case = {}
bold_text = elements[0].xpath(".//strong[1]/.. | .//b[1]/..")
if not bold_text:
return {}
intro_text = (
elements[0].xpath(".//strong[1]/.. | .//b[1]/..")[0].text_content()
)
intro_text = intro_text.replace(";", ",")
name, cite = intro_text.split(",", 1)
# Unfortunately there are no accessible file dates without PDF parsing,
# so we generate an approximate date and set date_filed_is_approximate = True.
# It is set back to False once the real date is extracted from the PDF on the CL side.
case["date_filed_is_approximate"] = True
years = re.findall(r"\d{4}", cite)
if not years:
return {}
case["date"] = f"{years[-1]}-07-01"
case["status"] = "Published"
case["citation"] = cite
case["name"] = name
case["url"] = elements[0].xpath(".//a")[0].get("href")
case["docket"] = elements[0].xpath(".//a")[0].text_content()

# Iterate over the P tags that hold the summaries (when present)
summary = []
for element in elements:
if element.tag == "p":
summary.append(element.text_content())
case["summary"] = "\n".join(summary).strip()
return case
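
The removed _process_elements method implements the HR-splitting strategy described in the docstring above. A minimal sketch of that grouping step, using made-up markup in the nesting style the DOJ pages follow, is:

from lxml import html

# Hypothetical article markup: decisions separated by <hr> tags, with the
# last decision not followed by an <hr>.
ARTICLE = """
<article>
  <p><strong>Matter of X-, 28 IN Dec. 1 (BIA 2020)</strong></p>
  <p>Summary of the first decision.</p>
  <hr/>
  <p><b>Matter of Y-, 28 IN Dec. 2 (BIA 2020)</b></p>
  <p>Summary of the second decision.</p>
</article>
"""

article = html.fromstring(ARTICLE)
groups, current = [], []
for element in article.iter():
    if element.tag == "hr":
        groups.append(current)  # everything collected so far is one decision
        current = []
    else:
        current.append(element)
if current:
    groups.append(current)  # the final decision has no trailing <hr>
print(len(groups))  # 2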
self.status = "Published"

def _process_html(self):
if not self.test_mode_enabled():
# Sort the URLs by volume to enable the backscraper.
# We reverse-sort the links by volume and choose the first one,
# unless we are backscraping, in which case we choose the URL for
# the current loop iteration.
if not self.urls:
urls = self.html.xpath(
".//table[1]/tbody/tr/td/a[contains(., 'Volume')]"
@@ -84,30 +35,35 @@ def get_text(elem):

self.urls = sorted(urls, key=get_text, reverse=True)
self.url = self.urls[self.volume].get("href")
# Download the new page of decisions
self.html = super()._download()

# Get the article, which contains all of our content.
article = self.html.xpath(".//article")[0]
# Get the last element in the article; it ends the _process_elements
# loop on the final call, because no HR tag is present after the last
# decision.

last = list(article.iter())[-1]
# Iterate over every tag in the article to separate out the cases.
elements = []
for element in article.iter():
elements.append(element)
# Process the data when an HR tag or the last element is found.
# This loop lets us collect all of the elements, and thus all of
# the data that we are looking for. The DOJ uses inconsistent
# HTML that sometimes nests the elements of an opinion and
# sometimes does not.
if element.tag == "hr" or element == last:
case = self._process_elements(elements)
if case:
self.cases.append(case)
elements, case = [], {}
table = self.html.xpath(".//table")[0]
for row in table.xpath(".//strong"):
name = row.text_content().strip().strip(",")
row_text = row.xpath("..")[0].text_content()
if "BIA" not in row_text:
continue
if not name:
continue
citation, year = row_text.split(name)[1].split("(")
cells = row.xpath("..")[0].xpath(
"following-sibling::td[position() <= 2]"
)
if not cells:
continue
url = cells[0].xpath(".//a/@href")
docket = cells[0].xpath(".//a")[0].text_content()
if not url:
continue
self.cases.append(
{
"name": name,
"citation": citation,
"url": url[0],
"docket": docket,
"date": f"{year.split()[1]}-07-01",
"date_filed_is_approximate": True,
}
)
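
A hedged, standalone sketch of the intro-line parsing that the new _process_html above relies on, run against a made-up BIA entry; the regex cleanup of the year is an illustrative normalization, not necessarily the exact merged logic:

import re

# Hypothetical bold intro line as it appears on the BIA decisions page.
row_text = "Matter of A-B-, 28 I&N Dec. 199 (BIA 2021)"
name = "Matter of A-B-"  # text of the <strong> tag with the trailing comma stripped

citation, year_part = row_text.split(name)[1].split("(")
citation = citation.strip(" ,")                 # "28 I&N Dec. 199"
year = re.search(r"\d{4}", year_part).group()   # "2021"

case = {
    "name": name,
    "citation": citation,
    "date": f"{year}-07-01",  # only the year is known, so use a mid-year placeholder
    "date_filed_is_approximate": True,
}
print(case)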

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the date filed from the text?
16 changes: 7 additions & 9 deletions juriscraper/opinions/united_states/administrative_agency/olc.py
@@ -23,15 +23,13 @@ def __init__(self, *args, **kwargs):
self.status = "Published"

def _process_html(self):
for row in self.html.xpath(
".//tr[contains(@class , 'even')] | .//tr[contains(@class , 'odd')]"
):
date = get_row_column_text(row, 1)
if "Date of Issuance" in date:
date = date.split("\n")[-1].strip()
name = get_row_column_text(row, 2)
url = get_row_column_links(row, 2)
summary = get_row_column_text(row, 3)
for row in self.html.xpath(".//article"):
name = row.xpath(".//h2")[0].text_content().strip()
url = row.xpath(".//a/@href")[0]
date = row.xpath(".//time")[0].text_content()
if not name:
continue
summary = row.xpath(".//p")[0].text_content()
self.cases.append(
{
"date": date,
107 changes: 25 additions & 82 deletions juriscraper/opinions/united_states/federal_appellate/ca10.py
@@ -1,96 +1,39 @@
from lxml import html

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = (
"http://www.ca10.uscourts.gov/opinions/new/daily_decisions.rss"
)
self.court_id = self.__module__

def _get_case_names(self):
"""Case name parsing
Expected value for title_string:
&lt;p&gt;Bazan-Martinez v. Garland&lt;/p&gt;
"""
case_names = []
for title_string in self.html.xpath("//item/title/text()"):
try:
p_element = html.etree.fromstring(str(title_string))
title_string = p_element.xpath("/p/text()")[0]
case_names.append(title_string)
except:
logger.error(f"Error while parsing case name: {title_string}")
raise InsanityException(
f"Error while parsing case name: {title_string}"
)
return case_names

def _get_download_urls(self):
return [
html.tostring(e, method="text").decode()
for e in self.html.xpath("//item/link")
]

def _get_case_dates(self):
"""Case date parsing
Expected value for date_string:
&lt;span class=&quot;date-display-single&quot; property=&quot;dc:date&quot; datatype=&quot;xsd:dateTime&quot; content=&quot;2021-11-16T00:00:00-07:00&quot;&gt;Tue Nov 16 2021&lt;/span&gt;
"""
dates = []
for date_string in self.html.xpath("//item/pubdate/text()"):
try:
span_element = html.etree.fromstring(str(date_string))
date_string = span_element.xpath("/span/text()")[0]
dates.append(convert_date_string(date_string))
except:
logger.error(f"Error while parsing case date: {date_string}")
raise InsanityException(
f"Error while parsing case date: {date_string}"
)
return dates

def _get_docket_numbers(self):
"""Case docket parsing
Expected content in description tag:
Docket#: 21-6001 - Date Issued: Mon Nov 15 2021 - Unpublished Order and Judgment
"""
return [
e.split(" - ")[0].split(":")[1]
for e in self.html.xpath("//item/description/text()")
]

def _get_precedential_statuses(self):
"""Case precedential status parsing
Expected content in description tag:
Docket#: 21-5062 - Date Issued: Fri Nov 12 2021 - Unpublished Order and Judgment
Status:
- Published: "Published Opinion"
- Unpublished: "Unpublished Order and Judgment"
"""
return [
"Published"
if "published opinion" in e.split(" - ")[2].lower()
else "Unpublished"
for e in self.html.xpath("//item/description/text()")
]

def _get_lower_courts(self):
"""Case lower court name parsing
namescpace "dc": "http://purl.org/dc/elements/1.1/"
Tags:
- <dc:creator>Board of Immigration Appeals</dc:creator>
"""
return [
e
for e in self.html.xpath(
"//item/creator/text()",
def _process_html(self):
for item in self.html.xpath(".//item"):
for e in item.xpath(
".//description/text()",
namespaces={"dc": "http://purl.org/dc/elements/1.1/"},
):
if "Published Opinion" in e:
status = "Published"
else:
status = "Unpublished"
docket = e.split()[1].strip()
date = convert_date_string(item.xpath(".//pubdate/text()")[0])
formatted_date = date.strftime("%Y-%m-%d")
self.cases.append(
{
"url": html.tostring(item.xpath("link")[0], method="text")
.decode()
.replace("\\n", "")
.strip(),
"name": item.xpath(".//title/text()")[0],
"date": formatted_date,
"status": status,
"docket": docket,
}
)
]
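
For context, a minimal sketch of how the new CA10 _process_html derives the docket and status from a single RSS description string; the sample string follows the format documented in the removed docstrings above:

# Hypothetical <description> text from the CA10 daily-decisions RSS feed.
description = (
    "Docket#: 21-6001 - Date Issued: Mon Nov 15 2021 - Unpublished Order and Judgment"
)

docket = description.split()[1]  # second whitespace-separated token: "21-6001"
status = "Published" if "Published Opinion" in description else "Unpublished"
print(docket, status)  # 21-6001 Unpublished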
@@ -11,8 +11,6 @@
class Site(acca_p.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://www.jagcnet.army.mil/85257546006DF36B/MODD?OpenView&Count=-1"
self.url = "https://www.jagcnet.army.mil/ACCALibrary/cases/opinions/MO"
self.court_id = self.__module__

def _get_precedential_statuses(self):
return ["Unpublished"] * len(self.case_names)
self.status = "Unpublished"