From ccd9a6ceeb51853436a836c721feb140ce19da35 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Wed, 8 Jan 2025 13:38:43 -0500
Subject: [PATCH] fix(ScraperExtractFromText): add fail case testing

Solves #1289

- Adds a failure test case to `test_extract_from_text`
- Updates bia, bap1, nm and sd scrapers to return an empty dict when no match is found
---
 .../united_states/administrative_agency/bia.py | 11 +++++++++--
 .../united_states/federal_bankruptcy/bap1.py   |  7 ++++---
 juriscraper/opinions/united_states/state/nm.py |  8 ++++++--
 juriscraper/opinions/united_states/state/sd.py |  9 ++++++---
 tests/local/test_ScraperExtractFromTextTest.py | 12 ++++++++++++
 5 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/juriscraper/opinions/united_states/administrative_agency/bia.py b/juriscraper/opinions/united_states/administrative_agency/bia.py
index 9190117b5..9953ed0d5 100644
--- a/juriscraper/opinions/united_states/administrative_agency/bia.py
+++ b/juriscraper/opinions/united_states/administrative_agency/bia.py
@@ -13,6 +13,7 @@
 from datetime import datetime
 from typing import Any, Dict
 
+from juriscraper.AbstractSite import logger
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
@@ -70,8 +71,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         """
         date = re.findall(
             r"Decided (by (Acting\s)?Attorney General )?(.*\d{4})",
             scraped_text,
-        )[0][-1]
-        date_filed = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
+        )
+        if not date:
+            logger.error("bia: unable to extract_from_text a date_filed")
+            return {}
+
+        date_filed = datetime.strptime(date[0][-1], "%B %d, %Y").strftime(
+            "%Y-%m-%d"
+        )
         metadata = {
             "OpinionCluster": {
                 "date_filed": date_filed,
diff --git a/juriscraper/opinions/united_states/federal_bankruptcy/bap1.py b/juriscraper/opinions/united_states/federal_bankruptcy/bap1.py
index 5f3b34d3a..e04bc0925 100644
--- a/juriscraper/opinions/united_states/federal_bankruptcy/bap1.py
+++ b/juriscraper/opinions/united_states/federal_bankruptcy/bap1.py
@@ -167,9 +167,10 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         """
         months = "|".join(calendar.month_name[1:])
         date_pattern = re.compile(rf"({months})\s+\d{{1,2}}\s?,?\s+\d{{4}}")
-        match = re.search(date_pattern, scraped_text)
-        date_extracted = match.group(0) if match else ""
-        date_filed = re.sub(r"\s+", " ", date_extracted).strip()
+        if match := re.search(date_pattern, scraped_text):
+            date_filed = re.sub(r"\s+", " ", match.group(0)).strip()
+        else:
+            return {}
 
         metadata = {
             "OpinionCluster": {
diff --git a/juriscraper/opinions/united_states/state/nm.py b/juriscraper/opinions/united_states/state/nm.py
index 5600013c7..5d45f075e 100644
--- a/juriscraper/opinions/united_states/state/nm.py
+++ b/juriscraper/opinions/united_states/state/nm.py
@@ -126,10 +126,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         :param scraped_text: Text of scraped content
         :return: metadata
         """
-        docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)[0]
+        docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)
+        if not docket_number:
+            logger.error("nm: unable to extract_from_text a docket_number")
+            return {}
+
         metadata = {
             "OpinionCluster": {
-                "docket_number": docket_number,
+                "docket_number": docket_number[0],
             },
         }
         return metadata
diff --git a/juriscraper/opinions/united_states/state/sd.py b/juriscraper/opinions/united_states/state/sd.py
index 733e4d8f3..cc81adc10 100644
--- a/juriscraper/opinions/united_states/state/sd.py
+++ b/juriscraper/opinions/united_states/state/sd.py
@@ -142,12 +142,15 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         """
         # The docket number appears to be the first text on the page.
-        # So I crop the text to avoid any confusion that might occur in the
+        # So we crop the text to avoid any confusion that might occur in the
         # body of an opinion.
-        docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])[0]
+        docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])
+        if not docket:
+            return {}
+
         metadata = {
             "Docket": {
-                "docket_number": docket,
+                "docket_number": docket[0],
             },
         }
         return metadata
diff --git a/tests/local/test_ScraperExtractFromTextTest.py b/tests/local/test_ScraperExtractFromTextTest.py
index 717adcb61..2e55825e8 100644
--- a/tests/local/test_ScraperExtractFromTextTest.py
+++ b/tests/local/test_ScraperExtractFromTextTest.py
@@ -1,4 +1,5 @@
 import datetime
+import logging
 import unittest
 
 from juriscraper.lib.importer import build_module_list
@@ -752,16 +753,27 @@ class ScraperExtractFromText(unittest.TestCase):
 
     def test_extract_from_text(self):
         """Test that extract_from_text returns the expected data."""
+        # prevent logger.error calls from being triggered
+        logging.disable(logging.CRITICAL)
         for module_string, test_cases in self.test_data.items():
             package, module = module_string.rsplit(".", 1)
             mod = __import__(
                 f"{package}.{module}", globals(), locals(), [module]
            )
             site = mod.Site()
+
+            # ensure that a dict is returned even when no data can be parsed
+            # this also ensures that no uncontrolled exceptions are raised
+            self.assertTrue(
+                isinstance(
+                    site.extract_from_text("Lorem ipsum dolorem..."), dict
+                )
+            )
             for test_case in test_cases:
                 self.assertEqual(
                     site.extract_from_text(test_case[0]), test_case[1]
                 )
+        logging.disable(logging.NOTSET)
 
     def test_extract_from_text_properly_implemented(self):
         """Ensure that extract_from_text is properly implemented."""
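
Testing note: the new fallback can also be exercised directly in a REPL. The snippet below is a minimal sketch, not part of the diff; it assumes a local checkout with this patch applied is importable, the input strings are arbitrary examples, and, as in the new test case, Site() is instantiated without issuing any request.

    from juriscraper.opinions.united_states.federal_bankruptcy import bap1

    site = bap1.Site()

    # No "Month D, YYYY" date in the text: the patched extract_from_text
    # returns an empty dict instead of metadata built from an empty date.
    assert site.extract_from_text("Lorem ipsum dolorem...") == {}

    # A date is present: an OpinionCluster entry should still be returned.
    assert "OpinionCluster" in site.extract_from_text("Decided January 5, 2024")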