Fix remaining scrapers #763

Merged · 18 commits · Nov 17, 2023
99 changes: 16 additions & 83 deletions juriscraper/opinions/united_states/administrative_agency/asbca.py
@@ -8,97 +8,30 @@
2016-03-17: Website and phone are dead. Scraper disabled in __init__.py.
"""

import re
from datetime import datetime

from juriscraper.lib.string_utils import clean_if_py3, convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = (
"http://www.asbca.mil/Decisions/decisions%d.html"
% datetime.today().year
)
self.columns = None
self.back_scrape_iterable = list(range(2013, 2000 - 1, -1))

# Fix broken month names and funky whitespace usage.
def _clean_text(self, text):
text = super()._clean_text(text)
text = text.replace(" ", " ").replace(" ", " ")
text = text.replace("Januray", "January")
text = text.replace("Februrary", "February")
text = re.sub(re.compile(r"[\s]+", flags=re.MULTILINE), " ", text)
return text

def parse_column_names(self):
# Lookup column names and save them for later
self.columns = dict()
path = "//table/tr[1]/td"
i = 1
for column in self.html.xpath(path):
colname = clean_if_py3("".join(column.itertext())).strip()
self.columns[colname] = i
i += 1
return self.columns

def _get_case_dates(self):
self.parse_column_names()
path = "//table/tr[td/a]/td[%d]/text()" % (
self.columns["Decision Date"]
)
return [
self._get_date_object_from_string(date_string)
for date_string in self.html.xpath(path)
]

def _get_date_object_from_string(self, date_string):
date_string = (
clean_if_py3(date_string)
.strip()
.replace(" ,", ", ")
.replace("2104", "2014")
)
return convert_date_string(date_string)

def _get_case_names(self):
path = "//table/tr/td/a[1]"
case_names = [
clean_if_py3("".join(txt.itertext()).strip())
for txt in self.html.xpath(path)
]
return case_names

def _get_download_urls(self):
path = "//table/tr/td/a[1]/@href"
return [clean_if_py3(href).strip() for href in self.html.xpath(path)]

def _get_judges(self):
path = "//table/tr[td/a]/td[%d]/text()" % (self.columns["Judge"],)
return [clean_if_py3(txt).strip() for txt in self.html.xpath(path)]

def _get_docket_numbers(self):
if "ASBCA Number" not in self.columns:
return None
path = "//table/tr[td/a]/td[%d]/text()" % self.columns["ASBCA Number"]
return [
f"ASBCA No. {clean_if_py3(txt).strip()}"
for txt in self.html.xpath(path)
]

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_dates)

def _download_backwards(self, year):
self.url = "http://www.asbca.mil/Decisions/decisions%d.html" % year
if year == 2010:
self.url = "http://www.asbca.mil/Decisions/decisions.html"
self.html = self._download()

def _get_case_name_shorts(self):
# We don't (yet) support short case names for administrative bodies.
return None
self.status = "Published"

def _process_html(self):
for row in self.html.xpath(".//tr")[1:]:
col1, col2, col3, col4 = row.xpath(".//td")
date = col1.text_content().strip()
if not col2.text_content().strip():
continue
name = col3.text_content().strip()
docket = col2.text_content().strip()
url = col3.xpath(".//a/@href")[0]
self.cases.append(
{"date": date, "name": name, "url": url, "docket": docket}
)
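
A minimal, self-contained sketch of the row-parsing approach used by the new _process_html above; the sample markup and its column order (date, docket, linked case name, judge) are assumptions for illustration, not the live ASBCA page:

from lxml import html

# Hypothetical markup mirroring the four-column decisions table assumed above.
SAMPLE = """
<table>
  <tr><td>Decision Date</td><td>ASBCA Number</td><td>Case Name</td><td>Judge</td></tr>
  <tr><td>November 1, 2023</td><td>63000</td><td><a href="/files/63000.pdf">Acme Corp.</a></td><td>Smith</td></tr>
</table>
"""

cases = []
for row in html.fromstring(SAMPLE).xpath(".//tr")[1:]:  # skip the header row
    col1, col2, col3, col4 = row.xpath(".//td")
    if not col2.text_content().strip():  # rows without a docket are spacers
        continue
    cases.append(
        {
            "date": col1.text_content().strip(),
            "docket": col2.text_content().strip(),
            "name": col3.text_content().strip(),
            "url": col3.xpath(".//a/@href")[0],
        }
    )
print(cases)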
102 changes: 29 additions & 73 deletions juriscraper/opinions/united_states/administrative_agency/bia.py
@@ -19,61 +19,12 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = "https://www.justice.gov/eoir/ag-bia-decisions"
self.article = None
self.volume = 0
self.urls = None

def _process_elements(self, elements) -> Dict[str, Any]:
"""Process the element grouping.

There is no easy way to parse out the content. Unfortunately, the DOJ
admins randomly nest some elements and not others. The only consistency
is that the content is always split between HR tags. So we iterate
over the elements in order until we find an HR tag and then process
the content. Rinse, wash, repeat.

Additionally, the only date we have is the year of the decision.

:param elements: The elements between <hr> tags.
:return: Case data
"""
case = {}
bold_text = elements[0].xpath(".//strong[1]/.. | .//b[1]/..")
if not bold_text:
return {}
intro_text = (
elements[0].xpath(".//strong[1]/.. | .//b[1]/..")[0].text_content()
)
intro_text = intro_text.replace(";", ",")
name, cite = intro_text.split(",", 1)
# Unfortunately there are no accessible file dates without PDF parsing,
# so we generate an approximate date and set date_filed_is_approximate = True.
# It is set back to False once the real date is extracted from the PDF on the CL side.
case["date_filed_is_approximate"] = True
years = re.findall(r"\d{4}", cite)
if not years:
return {}
case["date"] = f"{years[-1]}-07-01"
case["status"] = "Published"
case["citation"] = cite
case["name"] = name
case["url"] = elements[0].xpath(".//a")[0].get("href")
case["docket"] = elements[0].xpath(".//a")[0].text_content()

# Iterate over the P tags that hold the summaries (when present)
summary = []
for element in elements:
if element.tag == "p":
summary.append(element.text_content())
case["summary"] = "\n".join(summary).strip()
return case
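
The removed _process_elements method implements the HR-splitting strategy described in the docstring above. A minimal sketch of that grouping step, using made-up markup in the nesting style the DOJ pages follow, is:

from lxml import html

# Hypothetical article markup: decisions separated by <hr> tags, with the
# last decision not followed by an <hr>.
ARTICLE = """
<article>
  <p><strong>Matter of X-, 28 IN Dec. 1 (BIA 2020)</strong></p>
  <p>Summary of the first decision.</p>
  <hr/>
  <p><b>Matter of Y-, 28 IN Dec. 2 (BIA 2020)</b></p>
  <p>Summary of the second decision.</p>
</article>
"""

article = html.fromstring(ARTICLE)
groups, current = [], []
for element in article.iter():
    if element.tag == "hr":
        groups.append(current)  # everything collected so far is one decision
        current = []
    else:
        current.append(element)
if current:
    groups.append(current)  # the final decision has no trailing <hr>
print(len(groups))  # 2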
self.status = "Published"

def _process_html(self):
if not self.test_mode_enabled():
# Sort the URLs by volume to enable the backscraper.
# We reverse-sort the links by volume and choose the first one,
# unless we are backscraping, in which case we choose the URL for
# the current loop iteration.
if not self.urls:
urls = self.html.xpath(
".//table[1]/tbody/tr/td/a[contains(., 'Volume')]"
@@ -84,30 +35,35 @@ def get_text(elem):

self.urls = sorted(urls, key=get_text, reverse=True)
self.url = self.urls[self.volume].get("href")
# Download the new page of decisions
self.html = super()._download()

# Get the article, which contains all of our content.
article = self.html.xpath(".//article")[0]
# Get the last element in the article; it ends the _process_elements
# loop on the final call, because no HR tag is present after the last
# decision.

last = list(article.iter())[-1]
# Iterate over every tag in the article to separate out the cases.
elements = []
for element in article.iter():
elements.append(element)
# Process the data when an HR tag or the last element is found.
# This loop lets us collect all of the elements, and thus all of
# the data that we are looking for. The DOJ uses inconsistent
# HTML that sometimes nests the elements of an opinion and
# sometimes does not.
if element.tag == "hr" or element == last:
case = self._process_elements(elements)
if case:
self.cases.append(case)
elements, case = [], {}
table = self.html.xpath(".//table")[0]
for row in table.xpath(".//strong"):
name = row.text_content().strip().strip(",")
row_text = row.xpath("..")[0].text_content()
if "BIA" not in row_text:
continue
if not name:
continue
citation, year = row_text.split(name)[1].split("(")
cells = row.xpath("..")[0].xpath(
"following-sibling::td[position() <= 2]"
)
if not cells:
continue
url = cells[0].xpath(".//a/@href")
docket = cells[0].xpath(".//a")[0].text_content()
if not url:
continue
self.cases.append(
{
"name": name,
"citation": citation,
"url": url[0],
"docket": docket,
"date": f"{year.split()[1]}-07-01",
"date_filed_is_approximate": True,
}
)
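
A hedged, standalone sketch of the intro-line parsing that the new _process_html above relies on, run against a made-up BIA entry; the regex cleanup of the year is an illustrative normalization, not necessarily the exact merged logic:

import re

# Hypothetical bold intro line as it appears on the BIA decisions page.
row_text = "Matter of A-B-, 28 I&N Dec. 199 (BIA 2021)"
name = "Matter of A-B-"  # text of the <strong> tag with the trailing comma stripped

citation, year_part = row_text.split(name)[1].split("(")
citation = citation.strip(" ,")                 # "28 I&N Dec. 199"
year = re.search(r"\d{4}", year_part).group()   # "2021"

case = {
    "name": name,
    "citation": citation,
    "date": f"{year}-07-01",  # only the year is known, so use a mid-year placeholder
    "date_filed_is_approximate": True,
}
print(case)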

def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""Can we extract the date filed from the text?
16 changes: 7 additions & 9 deletions juriscraper/opinions/united_states/administrative_agency/olc.py
@@ -23,15 +23,13 @@ def __init__(self, *args, **kwargs):
self.status = "Published"

def _process_html(self):
for row in self.html.xpath(
".//tr[contains(@class , 'even')] | .//tr[contains(@class , 'odd')]"
):
date = get_row_column_text(row, 1)
if "Date of Issuance" in date:
date = date.split("\n")[-1].strip()
name = get_row_column_text(row, 2)
url = get_row_column_links(row, 2)
summary = get_row_column_text(row, 3)
for row in self.html.xpath(".//article"):
name = row.xpath(".//h2")[0].text_content().strip()
url = row.xpath(".//a/@href")[0]
date = row.xpath(".//time")[0].text_content()
if not name:
continue
summary = row.xpath(".//p")[0].text_content()
self.cases.append(
{
"date": date,
107 changes: 25 additions & 82 deletions juriscraper/opinions/united_states/federal_appellate/ca10.py
@@ -1,96 +1,39 @@
from lxml import html

from juriscraper.AbstractSite import logger
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSite import OpinionSite
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSite):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = (
"http://www.ca10.uscourts.gov/opinions/new/daily_decisions.rss"
)
self.court_id = self.__module__

def _get_case_names(self):
"""Case name parsing
Expected value for title_string:
&lt;p&gt;Bazan-Martinez v. Garland&lt;/p&gt;
"""
case_names = []
for title_string in self.html.xpath("//item/title/text()"):
try:
p_element = html.etree.fromstring(str(title_string))
title_string = p_element.xpath("/p/text()")[0]
case_names.append(title_string)
except:
logger.error(f"Error while parsing case name: {title_string}")
raise InsanityException(
f"Error while parsing case name: {title_string}"
)
return case_names

def _get_download_urls(self):
return [
html.tostring(e, method="text").decode()
for e in self.html.xpath("//item/link")
]

def _get_case_dates(self):
"""Case date parsing
Expected value for date_string:
&lt;span class=&quot;date-display-single&quot; property=&quot;dc:date&quot; datatype=&quot;xsd:dateTime&quot; content=&quot;2021-11-16T00:00:00-07:00&quot;&gt;Tue Nov 16 2021&lt;/span&gt;
"""
dates = []
for date_string in self.html.xpath("//item/pubdate/text()"):
try:
span_element = html.etree.fromstring(str(date_string))
date_string = span_element.xpath("/span/text()")[0]
dates.append(convert_date_string(date_string))
except:
logger.error(f"Error while parsing case date: {date_string}")
raise InsanityException(
f"Error while parsing case date: {date_string}"
)
return dates

def _get_docket_numbers(self):
"""Case docket parsing
Expected content in description tag:
Docket#: 21-6001 - Date Issued: Mon Nov 15 2021 - Unpublished Order and Judgment
"""
return [
e.split(" - ")[0].split(":")[1]
for e in self.html.xpath("//item/description/text()")
]

def _get_precedential_statuses(self):
"""Case precedential status parsing
Expected content in description tag:
Docket#: 21-5062 - Date Issued: Fri Nov 12 2021 - Unpublished Order and Judgment
Status:
- Published: "Published Opinion"
- Unpublished: "Unpublished Order and Judgment"
"""
return [
"Published"
if "published opinion" in e.split(" - ")[2].lower()
else "Unpublished"
for e in self.html.xpath("//item/description/text()")
]

def _get_lower_courts(self):
"""Case lower court name parsing
namescpace "dc": "http://purl.org/dc/elements/1.1/"
Tags:
- <dc:creator>Board of Immigration Appeals</dc:creator>
"""
return [
e
for e in self.html.xpath(
"//item/creator/text()",
def _process_html(self):
for item in self.html.xpath(".//item"):
for e in item.xpath(
".//description/text()",
namespaces={"dc": "http://purl.org/dc/elements/1.1/"},
):
if "Published Opinion" in e:
status = "Published"
else:
status = "Unpublished"
docket = e.split()[1].strip()
date = convert_date_string(item.xpath(".//pubdate/text()")[0])
formatted_date = date.strftime("%Y-%m-%d")
self.cases.append(
{
"url": html.tostring(item.xpath("link")[0], method="text")
.decode()
.replace("\\n", "")
.strip(),
"name": item.xpath(".//title/text()")[0],
"date": formatted_date,
"status": status,
"docket": docket,
}
)
]
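
For context, a minimal sketch of how the new CA10 _process_html derives the docket and status from a single RSS description string; the sample string follows the format documented in the removed docstrings above:

# Hypothetical <description> text from the CA10 daily-decisions RSS feed.
description = (
    "Docket#: 21-6001 - Date Issued: Mon Nov 15 2021 - Unpublished Order and Judgment"
)

docket = description.split()[1]  # second whitespace-separated token: "21-6001"
status = "Published" if "Published Opinion" in description else "Unpublished"
print(docket, status)  # 21-6001 Unpublished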
@@ -11,8 +11,6 @@
class Site(acca_p.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://www.jagcnet.army.mil/85257546006DF36B/MODD?OpenView&Count=-1"
self.url = "https://www.jagcnet.army.mil/ACCALibrary/cases/opinions/MO"
self.court_id = self.__module__

def _get_precedential_statuses(self):
return ["Unpublished"] * len(self.case_names)
self.status = "Unpublished"