Merge pull request #775 from freelawproject/drop-selenium-in-colorado

feat(colorado): Remove selenium from Colorado
freelawproject · Nov 19, 2023 · 68a73f6
2 parents 002a135 + 7c1ba0c
commit 68a73f6
Show file tree

Hide file tree

Showing 8 changed files with 5,880 additions and 1,078 deletions.
diff --git a/juriscraper/opinions/united_states/state/colo.py b/juriscraper/opinions/united_states/state/colo.py
@@ -9,120 +9,84 @@
          also try submitting the form here: http://www.cobar.org/contact
 History:
     - 2022-01-31: Updated by William E. Palin
-    - 2023-01-05: Updated by William E. Palin
+    - 2023-01-05: Updated by WEP
+    - 2023-11-19: Drop Selenium by WEP
 """
 import datetime
 import re
+from datetime import date, timedelta
 
-from lxml.html import fromstring, tostring
+from dateutil import parser
 
-from juriscraper.OpinionSiteLinearWebDriven import OpinionSiteLinearWebDriven
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSiteLinearWebDriven):
+class Site(OpinionSiteLinear):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.year = datetime.date.today().year
-        self.month = datetime.date.today().month
-        self.url = f"https://www.courts.state.co.us/Courts/Supreme_Court/Case_Announcements/Index.cfm?year={self.year}&month={self.month}&Submit=Go"
-        self.uses_selenium = True
+        self.six_months_ago = date.today() - timedelta(180)
+        self.status = "Published"
+        self.url = f"https://www.courts.state.co.us/Courts/Supreme_Court/Proceedings/Index.cfm"
 
-    def _process_html(self):
-        if not self.test_mode_enabled():
-            self.initiate_webdriven_session()
-
-        if not self.test_mode_enabled():
-            links = self.html.xpath(".//a[@class='touchable-link']")
-        else:
-            links = ["fake_link"]
+    def match_regex(self, str):
+        """Match date regex patterns
 
-        for link in links:
-            if self.test_mode_enabled():
-                super()._download()
-                date_filed = "December 19, 2022"
-            else:
-                date_filed = link.text_content()
-                self.webdriver.get(link.get("href"))
-                # Wait for pdf to render
-                self.webdriver.implicitly_wait(10000)
-                # And click next twice to make sure we fully render the PDF content
-                self.find_element_by_id("next").click()
-                self.find_element_by_id("next").click()
+        :param str: Date string to search
+        :return: Date object, or None if no date pattern matches
+        """
+        date_match = re.search(
+            r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b|\b\d{1,2}/\d{1,2}/\d{2,4}\b",
+            str,
+        )
+        if date_match:
+            return parser.parse(date_match.group(0)).date()
+        return None
 
-                self.html = fromstring(self.webdriver.page_source)
+    def extract_dates(self, row, partial_date):
+        """Extract out one of the many date patterns
 
-            for pg in self.html.xpath(".//div[@class='page']"):
-                if not pg.xpath(".//span[@role='link']/span/@aria-owns"):
-                    continue
-                anno_id = pg.xpath(".//span[@role='link']/span/@aria-owns")[0]
-                pg_content = []
-                for element in pg.xpath(".//span"):
-                    if element.get("class") == "markedContent":
-                        continue
-                    pg_content.append(element.text_content())
-                pg_text = " ".join(pg_content)
-
-                case_name = self.get_case_name(pg)
-                citation = re.findall(r"20\d{2} CO \d{1,3}M?", pg_text)
-                if not citation:
-                    # This is how we identify a page with an opinion
-                    continue
-                citation = citation[0]
-                judges = ""
-                m = re.findall(r"JUSTICE\s+\b([\wÁ]+)\b", pg.text_content())
-                if m:
-                    judges = ", ".join(m).replace("Á", "A")
+        :param row: Row to process
+        :param partial_date: Partial date string
+        :return: Date object and approximate boolean
+        """
+        raw_dates = row.xpath(
+            "following-sibling::div/p/a/following-sibling::text()"
+        )
+        raw_date_str = " ".join(raw_dates)
+        date = self.match_regex(raw_date_str)
+        if date:
+            return date, False
+        raw_date_str = row.xpath("following-sibling::div/p/a/text()")[0]
+        date = self.match_regex(raw_date_str)
+        if date:
+            return date, False
+        date_object = datetime.datetime.strptime(partial_date, "%b %Y").date()
+        return date_object, True
 
-                urls = pg.xpath(
-                    f"//a[@id='pdfjs_internal_id_{anno_id}']/@href"
+    def _process_html(self):
+        for row in self.html.xpath("//div[@id='Dispositions']/*"):
+            if row.tag == "a":
+                docket, name = (
+                    row.xpath("following-sibling::text()")[0]
+                    .strip()
+                    .split(" ", 1)
                 )
-                if not urls:
-                    # One time an opinion wasn't linked.
+                if "\xa0\xa0" not in name:
                     continue
-                url = urls[0]
-                if "Opinions" not in url:
+                name, partial_date = name.split("\xa0\xa0")
+                url = row.xpath("following-sibling::div/p/a/@href")[-1]
+                date, date_filed_is_approximate = self.extract_dates(
+                    row, partial_date
+                )
+                if date < self.six_months_ago:
                     continue
-                docket = url.split("/")[-1][:-4]
-
                 self.cases.append(
                     {
-                        "date": date_filed,
-                        "name": case_name,
+                        "date": str(date),
+                        "name": name,
                         "docket": docket,
-                        "status": "Published",
                         "url": url,
-                        "citation": citation,
-                        "judges": judges,
+                        "date_filed_is_approximate": date_filed_is_approximate,
                     }
                 )
-        if not self.test_mode_enabled():
-            self.webdriver.quit()
-
-    def get_case_name(self, pg) -> str:
-        """Get case name from page content
-
-        :param pg: Page Element
-        :return: Cleaned case name
-        """
-        start = False
-        content = []
-        for element in pg.xpath(".//*"):
-            if (
-                b"markedContent" in tostring(element)
-                or element.text_content() == ""
-            ):
-                continue
-            # This denotes a bolded field e.g. Respondent etc.
-            if ":" in element.text_content():
-                start = True
-                continue
-            # IDs an italics, e.g. en banc at the end of the name
-            if b"transform: scaleX(0.8" in tostring(element):
-                start = False
-            if not start:
-                continue
-
-            content.append(element.text_content())
-        case_name = " ".join(content[:-1]).replace(" ,", "").replace(" .", "")
-        return re.sub(r"\s{2,}", " ", case_name.strip())
diff --git a/juriscraper/opinions/united_states/state/coloctapp.py b/juriscraper/opinions/united_states/state/coloctapp.py
@@ -6,116 +6,42 @@
     - 2022-01-31: Updated by William E. Palin
     - 2023-01-05: Updated by William E. Palin
     - 2023-11-04: Updated by Honey K. Joule
+    - 2023-11-19: Updated by William E. Palin
 """
 
-
 import datetime
 import re
 
-from lxml.html import fromstring
-
 from juriscraper.opinions.united_states.state import colo
 
 
 class Site(colo.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.month = datetime.date.today().month
         self.year = datetime.date.today().year
-        self.url = f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Case_Announcements/Index.cfm?year={self.year}&month={self.month}&Submit=Go"
+        self.url = f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Case_Announcements/Index.cfm"
+        self.status = None
 
     def _process_html(self):
-        if not self.test_mode_enabled():
-            self.initiate_webdriven_session()
-
-        if not self.test_mode_enabled():
-            links = self.html.xpath(".//a[@class='touchable-link']")
-        else:
-            links = ["fake_link"]
-
-        for link in links:
-            if self.test_mode_enabled():
-                super()._download()
-                date_filed = "December 29, 2022"
-            else:
-                date_filed = link.text_content()
-                self.webdriver.get(link.get("href"))
-                # Wait for pdf to render
-                self.webdriver.implicitly_wait(10000)
-                # And click next twice to make sure we fully render the PDF content
-                self.find_element_by_id("next").click()
-                self.find_element_by_id("next").click()
-                self.html = fromstring(self.webdriver.page_source)
-
-            pattern = r"(202\d{1}COA\d+)\n(.*?)\n(Division \w+)"
-            urls_to_add = []
-            for pg in self.html.xpath(".//div[@class='page']"):
-                if not pg.xpath(".//span[@role='link']/span/@aria-owns"):
-                    continue
-                anno_ids = pg.xpath(".//span[@role='link']/span/@aria-owns")
-                for anno_id in anno_ids:
-                    urls = pg.xpath(
-                        f"//a[@id='pdfjs_internal_id_{anno_id}']/@href"
-                    )
-                    if urls:
-                        url = urls[0]
-                        if url not in urls_to_add:
-                            urls_to_add.append(url)
-                pg_content = []
-                for element in pg.xpath(".//span"):
-                    if element.get("class") != "markedContent":
-                        continue
-                    pg_content.append(element.text_content())
-                pg_text = "\n".join(pg_content)
-                for match in re.finditer(pattern, pg_text, re.DOTALL):
-                    citation = match.groups()[0]
-                    case_name = self.get_case_name(match.groups()[1])
-                    url = urls_to_add.pop(0)
-                    docket = url.split("/")[-1][:-4]
-                    self.cases.append(
-                        {
-                            "date": date_filed,
-                            "name": case_name,
-                            "docket": docket,
-                            "status": "Published",
-                            "url": url,
-                            "citation": citation,
-                        }
-                    )
-
-        if not self.test_mode_enabled():
-            self.webdriver.quit()
-
-    def get_case_name(self, page_content) -> str:
-        """Get case name from page content
-
-        :param pg: List of text
-        :return: Cleaned case name
-        """
-        start = False
-        content = []
-        for row in page_content.split("\n"):
-            if not row.strip():
-                continue
-            no_go_words = [
-                "Plaintiff",
-                "Appellee",
-                "Defendant",
-                "Intervenor",
-                "Appellant",
-            ]
-            if any(word in row for word in no_go_words):
+        date = self.html.xpath("//div/p/a/text()")[0]
+        for row in self.html.xpath("//p"):
+            modified_string = re.sub(r"\s", "", row.text_content())
+            if "PUBLISHED" == modified_string[:9]:
+                self.status = "Published"
                 continue
-            m = re.findall(r"Honorable .*, Judge", row)
-            if m:
-                start = True
+            if "UNPUBLISHED" == modified_string[:11]:
+                self.status = None
                 continue
-            m = re.findall(r"[A-Z ,]{5,}", row)
-            if m and m[0] == row:
-                start = False
-            if not start:
+            if not self.status:
                 continue
-            content.append(row.strip(","))
-        case_name = " ".join(content)
-        return case_name
+            docket, name = row.text_content().split(" ", 1)
+            self.cases.append(
+                {
+                    "name": name,
+                    "docket": docket,
+                    "date": date,
+                    "status": self.status,
+                    "url": f"https://www.courts.state.co.us/Courts/Court_of_Appeals/Opinion/{self.year}/{docket}-PD.pdf",
+                }
+            )