Skip to content

Commit

Permalink
Check for HTML in Markdown files and error out
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Dec 17, 2024
1 parent a6d06d0 commit 9457380
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
18 changes: 18 additions & 0 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,18 @@ def _get_taxonomy(repo="taxonomy"):
return taxonomy_file_paths


def _string_contains_html(s: str) -> bool:
"""Detect HTML tags in a string.
We use this to catch markdown files that may contain html elements since
docling does not support this."""
# Define a regex to detect HTML tags
html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")

# Check for HTML tags in the content
return bool(html_tag_pattern.search(s))


def _get_documents(
source: Dict[str, Union[str, List[str]]],
skip_checkout: bool = False,
Expand Down Expand Up @@ -160,6 +172,12 @@ def _get_documents(
# Process Markdown files
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
if _string_contains_html(content):
raise ValueError(f"Provided markdown file {file_path} contains"
" HTML, which is currently unsupported. Please"
" format your markdown documents without the"
" use of HTML or use a different document"
" filetype.")
file_contents.append(content)
filepaths.append(Path(file_path))
logger.info(
Expand Down
12 changes: 12 additions & 0 deletions tests/test_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

# First Party
from instructlab.sdg.utils import taxonomy
from instructlab.sdg.utils.taxonomy import _string_contains_html

TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?"

Expand Down Expand Up @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes(
):
seed_example_exists = True
assert seed_example_exists is True

@pytest.mark.parametrize(
    "s, contains_html",
    [
        ("hello, world!", False),
        ("hello, <div>world!</div>", True),
        ("", False),
        ("a < b", False),
        ("</p>", True),
        ("line<br/>break", True),
    ],
)
def test_string_contains_html(self, s, contains_html):
    """Check that _string_contains_html reports HTML tags and ignores plain text."""
    # NOTE(review): `self` is kept from the original signature; presumably
    # this test is a method of a test class defined above — confirm.
    assert _string_contains_html(s) == contains_html

0 comments on commit 9457380

Please sign in to comment.