Skip to content

Commit

Permalink
Merge pull request #775 from freelawproject/drop-selenium-in-colorado
Browse files Browse the repository at this point in the history
feat(colorado): Remove selenium from Colorado
  • Loading branch information
flooie authored Nov 19, 2023
2 parents 002a135 + 7c1ba0c commit 68a73f6
Show file tree
Hide file tree
Showing 8 changed files with 5,880 additions and 1,078 deletions.
150 changes: 57 additions & 93 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,120 +9,84 @@
also try submitting the form here: http://www.cobar.org/contact
History:
- 2022-01-31: Updated by William E. Palin
- 2023-01-05: Updated by William E. Palin
- 2023-01-05: Updated by WEP
- 2023-11-19: Drop Selenium by WEP
"""
import datetime
import re
from datetime import date, timedelta

from lxml.html import fromstring, tostring
from dateutil import parser

from juriscraper.OpinionSiteLinearWebDriven import OpinionSiteLinearWebDriven
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinearWebDriven):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.year = datetime.date.today().year
self.month = datetime.date.today().month
self.url = f"https://www.courts.state.co.us/Courts/Supreme_Court/Case_Announcements/Index.cfm?year={self.year}&month={self.month}&Submit=Go"
self.uses_selenium = True
self.six_months_ago = date.today() - timedelta(180)
self.status = "Published"
self.url = f"https://www.courts.state.co.us/Courts/Supreme_Court/Proceedings/Index.cfm"

def _process_html(self):
if not self.test_mode_enabled():
self.initiate_webdriven_session()

if not self.test_mode_enabled():
links = self.html.xpath(".//a[@class='touchable-link']")
else:
links = ["fake_link"]
def match_regex(self, str):
"""Match date regex patterns
for link in links:
if self.test_mode_enabled():
super()._download()
date_filed = "December 19, 2022"
else:
date_filed = link.text_content()
self.webdriver.get(link.get("href"))
# Wait for pdf to render
self.webdriver.implicitly_wait(10000)
# And click next twice to make sure we fully render the PDF content
self.find_element_by_id("next").click()
self.find_element_by_id("next").click()
:param str: Date Str
:return: Date object or none
"""
date_match = re.search(
r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b|\b\d{1,2}/\d{1,2}/\d{2,4}\b",
str,
)
if date_match:
return parser.parse(date_match.group(0)).date()
return None

self.html = fromstring(self.webdriver.page_source)
def extract_dates(self, row, partial_date):
"""Extract out one of the many date patterns
for pg in self.html.xpath(".//div[@class='page']"):
if not pg.xpath(".//span[@role='link']/span/@aria-owns"):
continue
anno_id = pg.xpath(".//span[@role='link']/span/@aria-owns")[0]
pg_content = []
for element in pg.xpath(".//span"):
if element.get("class") == "markedContent":
continue
pg_content.append(element.text_content())
pg_text = " ".join(pg_content)

case_name = self.get_case_name(pg)
citation = re.findall(r"20\d{2} CO \d{1,3}M?", pg_text)
if not citation:
# This is how we identify a page with an opinion
continue
citation = citation[0]
judges = ""
m = re.findall(r"JUSTICE\s+\b([\wÁ]+)\b", pg.text_content())
if m:
judges = ", ".join(m).replace("Á", "A")
:param row: Row to process
:param partial_date: Partial date string
:return: Date object and approximate boolean
"""
raw_dates = row.xpath(
"following-sibling::div/p/a/following-sibling::text()"
)
raw_date_str = " ".join(raw_dates)
date = self.match_regex(raw_date_str)
if date:
return date, False
raw_date_str = row.xpath("following-sibling::div/p/a/text()")[0]
date = self.match_regex(raw_date_str)
if date:
return date, False
date_object = datetime.datetime.strptime(partial_date, "%b %Y").date()
return date_object, True

urls = pg.xpath(
f"//a[@id='pdfjs_internal_id_{anno_id}']/@href"
def _process_html(self):
for row in self.html.xpath("//div[@id='Dispositions']/*"):
if row.tag == "a":
docket, name = (
row.xpath("following-sibling::text()")[0]
.strip()
.split(" ", 1)
)
if not urls:
# One time an opinion wasn't linked.
if "\xa0\xa0" not in name:
continue
url = urls[0]
if "Opinions" not in url:
name, partial_date = name.split("\xa0\xa0")
url = row.xpath("following-sibling::div/p/a/@href")[-1]
date, date_filed_is_approximate = self.extract_dates(
row, partial_date
)
if date < self.six_months_ago:
continue
docket = url.split("/")[-1][:-4]

self.cases.append(
{
"date": date_filed,
"name": case_name,
"date": str(date),
"name": name,
"docket": docket,
"status": "Published",
"url": url,
"citation": citation,
"judges": judges,
"date_filed_is_approximate": date_filed_is_approximate,
}
)
if not self.test_mode_enabled():
self.webdriver.quit()

def get_case_name(self, pg) -> str:
    """Assemble a cleaned case name from the elements of one page.

    Walks every descendant of ``pg`` and gathers text only while a
    "collecting" flag is on. The flag turns on after an element whose
    text contains a colon and turns off at an element whose markup
    contains ``transform: scaleX(0.8``.

    :param pg: Page Element
    :return: Cleaned case name
    """
    collecting = False
    fragments = []
    for node in pg.xpath(".//*"):
        markup = tostring(node)
        text = node.text_content()

        # Ignore wrapper nodes flagged as markedContent and empty nodes.
        if text == "" or b"markedContent" in markup:
            continue

        # A colon denotes a bolded field (e.g. Respondent); the name
        # text follows it, so begin collecting from the next element.
        if ":" in text:
            collecting = True
            continue

        # This scaling identifies italics (e.g. "en banc" at the end of
        # the name), which marks the end of the name text.
        if b"transform: scaleX(0.8" in markup:
            collecting = False

        if collecting:
            fragments.append(text)

    # The final collected fragment is intentionally left out of the name.
    joined = " ".join(fragments[:-1])
    joined = joined.replace(" ,", "").replace(" .", "")
    # Collapse any run of whitespace to a single space.
    return re.sub(r"\s{2,}", " ", joined.strip())
116 changes: 21 additions & 95 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,116 +6,42 @@
- 2022-01-31: Updated by William E. Palin
- 2023-01-05: Updated by William E. Palin
- 2023-11-04: Updated by Honey K. Joule
- 2023-11-19: Updated by William E. Palin
"""


import datetime
import re

from lxml.html import fromstring

from juriscraper.opinions.united_states.state import colo


class Site(colo.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.month = datetime.date.today().month
self.year = datetime.date.today().year
self.url = f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Case_Announcements/Index.cfm?year={self.year}&month={self.month}&Submit=Go"
self.url = f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Case_Announcements/Index.cfm"
self.status = None

def _process_html(self):
if not self.test_mode_enabled():
self.initiate_webdriven_session()

if not self.test_mode_enabled():
links = self.html.xpath(".//a[@class='touchable-link']")
else:
links = ["fake_link"]

for link in links:
if self.test_mode_enabled():
super()._download()
date_filed = "December 29, 2022"
else:
date_filed = link.text_content()
self.webdriver.get(link.get("href"))
# Wait for pdf to render
self.webdriver.implicitly_wait(10000)
# And click next twice to make sure we fully render the PDF content
self.find_element_by_id("next").click()
self.find_element_by_id("next").click()
self.html = fromstring(self.webdriver.page_source)

pattern = r"(202\d{1}COA\d+)\n(.*?)\n(Division \w+)"
urls_to_add = []
for pg in self.html.xpath(".//div[@class='page']"):
if not pg.xpath(".//span[@role='link']/span/@aria-owns"):
continue
anno_ids = pg.xpath(".//span[@role='link']/span/@aria-owns")
for anno_id in anno_ids:
urls = pg.xpath(
f"//a[@id='pdfjs_internal_id_{anno_id}']/@href"
)
if urls:
url = urls[0]
if url not in urls_to_add:
urls_to_add.append(url)
pg_content = []
for element in pg.xpath(".//span"):
if element.get("class") != "markedContent":
continue
pg_content.append(element.text_content())
pg_text = "\n".join(pg_content)
for match in re.finditer(pattern, pg_text, re.DOTALL):
citation = match.groups()[0]
case_name = self.get_case_name(match.groups()[1])
url = urls_to_add.pop(0)
docket = url.split("/")[-1][:-4]
self.cases.append(
{
"date": date_filed,
"name": case_name,
"docket": docket,
"status": "Published",
"url": url,
"citation": citation,
}
)

if not self.test_mode_enabled():
self.webdriver.quit()

def get_case_name(self, page_content) -> str:
"""Get case name from page content
:param pg: List of text
:return: Cleaned case name
"""
start = False
content = []
for row in page_content.split("\n"):
if not row.strip():
continue
no_go_words = [
"Plaintiff",
"Appellee",
"Defendant",
"Intervenor",
"Appellant",
]
if any(word in row for word in no_go_words):
date = self.html.xpath("//div/p/a/text()")[0]
for row in self.html.xpath("//p"):
modified_string = re.sub(r"\s", "", row.text_content())
if "PUBLISHED" == modified_string[:9]:
self.status = "Published"
continue
m = re.findall(r"Honorable .*, Judge", row)
if m:
start = True
if "UNPUBLISHED" == modified_string[:11]:
self.status = None
continue
m = re.findall(r"[A-Z ,]{5,}", row)
if m and m[0] == row:
start = False
if not start:
if not self.status:
continue
content.append(row.strip(","))
case_name = " ".join(content)
return case_name
docket, name = row.text_content().split(" ", 1)
self.cases.append(
{
"name": name,
"docket": docket,
"date": date,
"status": self.status,
"url": f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Opinion/{self.year}/{docket}-PD.pdf",
}
)
Loading

0 comments on commit 68a73f6

Please sign in to comment.