fix(idaho, idahoctapp): update to OpinionSiteLinear #1279

Merged · 3 commits · Dec 31, 2024
94 changes: 34 additions & 60 deletions juriscraper/opinions/united_states/state/idaho_civil.py
@@ -8,17 +8,13 @@
 - 2015-10-20, mlr: Updated due to new page in use.
 - 2015-10-23, mlr: Updated to handle annoying situation.
 - 2016-02-25 arderyp: Updated to catch "ORDER" (in addition to "Order") in download url text
+- 2024-12-30, grossir: updated to OpinionSiteLinear
 """
 
-from lxml import html
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
-from juriscraper.lib.string_utils import clean_if_py3, convert_date_string
-from juriscraper.OpinionSite import OpinionSite
-
 
-class Site(OpinionSite):
-    # Skip first row of table, it's a header
-    path_table_row_start = "//table//tr[position() > 1]"
+class Site(OpinionSiteLinear):
     # Skip rows that don't have link in 4th cell with
     # either 'Opinion', 'Order', 'ORDER', or 'Amend' in
     # the link text
@@ -30,66 +26,44 @@ class Site(OpinionSite):
         'contains(.//text(), "Amended")'
         "]"
     )
-    path_conditional_row = f"/td[4]//{path_conditional_anchor}"
-    path_base = f"{path_table_row_start}[./{path_conditional_row}]"
 
+    # https://www.isc.idaho.gov/appeals-court/isc_civil
+    base_url = "https://www.isc.idaho.gov/appeals-court/"
+    url_part = "isc_civil"
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/isc_civil"
+        self.url = f"{self.base_url}{self.url_part}"
         self.court_id = self.__module__
+        self.status = "Published"
 
-    def _get_case_names(self):
-        case_names = []
-        path = f"{self.path_base}/td[3]"
-        for cell in self.html.xpath(path):
-            name_string = html.tostring(
-                cell, method="text", encoding="unicode"
+    def _process_html(self):
+        row_xpath = f"//table//tr[.//{self.path_conditional_anchor}]"
+        for row in self.html.xpath(row_xpath):
+            url = self.get_opinion_url(row).replace("http://", "https://")
+            self.cases.append(
+                {
+                    "date": row.xpath("string(td[1])").strip(),
+                    "docket": row.xpath("string(td[2])").strip(),
+                    "name": row.xpath("string(td[3])").strip(),
+                    "url": url,
+                }
             )
-            name_string = clean_if_py3(name_string).strip()
-            if name_string:
-                case_names.append(name_string)
-        return case_names
 
-    def _get_download_urls(self):
-        # We'll accept an order document if the opinion document
-        # is missing. But we obviously prefer the opinion doc,
-        # as indicated in the algorithm below. Since each row
-        # can list multiple valid links, we will parse all
-        # acceptable links, take the opinion link if present,
-        # otherwise take the first acceptable link.
-        opinion_urls = []
-        path = f"{self.path_base}/td[4]"
-        path_link = f".//{self.path_conditional_anchor}"
-        for cell in self.html.xpath(path):
-            urls = []
-            url_opinion = False
-            for link in cell.xpath(path_link):
-                text = link.text_content().strip()
-                url = link.attrib["href"].replace("http://", "https://")
-                urls.append(url)
-                if "Opinion" in text:
-                    url_opinion = url
-            opinion_urls.append(url_opinion if url_opinion else urls[0])
-        return opinion_urls
+    def get_opinion_url(self, row) -> str:
+        """Get's the URL tagged as an Opinion, if possible
 
-    def _get_case_dates(self):
-        case_dates = []
-        path = f"{self.path_base}/td[1]"
-        for cell in self.html.xpath(path):
-            date_string = html.tostring(
-                cell, method="text", encoding="unicode"
-            )
-            date_string = clean_if_py3(date_string).strip()
-            if date_string:
-                date_string = date_string.replace(
-                    "Sept ", "Sep "
-                )  # GIGO! (+1 by arderyp)
-                case_dates.append(convert_date_string(date_string))
-        return case_dates
+        We'll accept an order document if the opinion document
+        is missing. Since each row can list multiple valid links,
+        we will parse all acceptable links, take the opinion link
+        if present, otherwise take the first acceptable link.
 
-    def _get_docket_numbers(self):
-        path = f"{self.path_base}/td[2]//text()"
-        return [text.strip() for text in self.html.xpath(path) if text.strip()]
+        :param row: the lxml object of the row
+        :return: the document URL
+        """
 
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
+        for link in row.xpath("td[4]//a"):
+            if "Opinion" in link.text_content().strip():
+                return link.xpath("@href")[0]
+
+        return row.xpath("td[4]//a/@href")[0]
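Note: the rewritten scraper leans on two lxml idioms — an XPath predicate that keeps only rows containing a qualifying link, and string(td[n]) to flatten a cell's text. The following standalone sketch (toy HTML and variable names, not the real court page or juriscraper internals) shows both in isolation:

# Illustrative only: a toy table standing in for the Idaho courts page.
from lxml import html

PAGE = """
<table>
  <tr><th>Date</th><th>Docket</th><th>Case</th><th>Documents</th></tr>
  <tr>
    <td>December 30, 2024</td><td>51234</td><td>Smith v. Jones</td>
    <td><a href="http://example.com/order.pdf">Order</a>
        <a href="http://example.com/opinion.pdf">Opinion</a></td>
  </tr>
  <tr><td>December 27, 2024</td><td>51235</td><td>No qualifying links</td><td></td></tr>
</table>
"""

path_conditional_anchor = (
    "a["
    'contains(.//text(), "Opinion") or '
    'contains(.//text(), "Order") or '
    'contains(.//text(), "ORDER") or '
    'contains(.//text(), "Amended")'
    "]"
)

tree = html.fromstring(PAGE)
# Rows without a qualifying anchor (including the header row) are skipped by
# the predicate, so the old position() > 1 header filter is no longer needed.
for row in tree.xpath(f"//table//tr[.//{path_conditional_anchor}]"):
    date = row.xpath("string(td[1])").strip()    # "December 30, 2024"
    docket = row.xpath("string(td[2])").strip()  # "51234"
    name = row.xpath("string(td[3])").strip()    # "Smith v. Jones"
    # Mirror get_opinion_url: prefer the link whose text mentions "Opinion",
    # otherwise fall back to the first acceptable link in the cell.
    links = row.xpath("td[4]//a")
    preferred = [a for a in links if "Opinion" in a.text_content()]
    url = (preferred or links)[0].get("href").replace("http://", "https://")
    print(date, docket, name, url)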
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idaho_criminal.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/isc_criminal"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/isc_criminal
+    url_part = "isc_criminal"
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idahoctapp_civil.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/coa_civil"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/coa_civil
+    url_part = "coa_civil"
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idahoctapp_criminal.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/coa_criminal"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/coa_criminal
+    url_part = "coa_criminal"
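The three subclasses above now differ only in a url_part class attribute; the __init__ inherited from idaho_civil.Site assembles the final URL from base_url and url_part. A minimal sketch of that pattern (toy class names, not the actual juriscraper classes):

# Toy illustration of the base_url/url_part composition the subclasses rely on.
class BaseCourt:
    base_url = "https://www.isc.idaho.gov/appeals-court/"
    url_part = "isc_civil"

    def __init__(self):
        # Subclasses only override url_part; the full URL is built here once.
        self.url = f"{self.base_url}{self.url_part}"


class CriminalCourt(BaseCourt):
    url_part = "isc_criminal"


print(BaseCourt().url)      # https://www.isc.idaho.gov/appeals-court/isc_civil
print(CriminalCourt().url)  # https://www.isc.idaho.gov/appeals-court/isc_criminal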