diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 897eddbe..5ac466e6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -109,6 +109,18 @@ def _get_taxonomy(repo="taxonomy"): return taxonomy_file_paths +def _string_contains_html(s: str) -> bool: + """Detect HTML tags in a string. + + We use this to catch markdown files that may contain html elements since + docling does not support this.""" + # Define a regex to detect HTML tags + html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") + + # Check for HTML tags in the content + return bool(html_tag_pattern.search(s)) + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, @@ -160,6 +172,12 @@ def _get_documents( # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() + if _string_contains_html(content): + raise ValueError(f"Provided markdown file {file_path} contains" + " HTML, which is currently unsupported. Please" + " format your markdown documents without the" + " use of HTML or use a different document" + " filetype.") file_contents.append(content) filepaths.append(Path(file_path)) logger.info( diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 0828e187..32114fcc 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -11,6 +11,7 @@ # First Party from instructlab.sdg.utils import taxonomy +from instructlab.sdg.utils.taxonomy import _string_contains_html TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True + + @pytest.mark.parametrize( + "s, contains_html", + [ + ("hello, world!", False), + ("hello,