reworkd · asim-shrestha · Dec 6, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024
diff --git a/core/harambe_core/errors.py b/core/harambe_core/errors.py
@@ -4,6 +4,13 @@ class HarambeException(Exception):
     pass
 
 
+class UnknownHTMLConverter(HarambeException):
+    """Raised when a requested HTML converter type is not recognised."""
+
+    def __init__(self, converter_type: object) -> None:
+        super().__init__(f"Unknown HTML converter type: {converter_type}")
+
+
 class GotoError(HarambeException):
     def __init__(self, url: str, status: int) -> None:
         super().__init__(

diff --git a/sdk/harambe/core.py b/sdk/harambe/core.py
@@ -26,6 +26,7 @@
     ResourceRequestHandler,
     ResourceType,
 )
+from harambe.html_converter import HTMLConverterType, get_html_converter
 from harambe.observer import (
     DownloadMeta,
     HTMLMetadata,
@@ -265,6 +266,7 @@ async def capture_html(
         exclude_selectors: List[str] | None = None,
         *,
         soup_transform: Optional[Callable[[BeautifulSoup], None]] = None,
+        html_converter_type: HTMLConverterType = "markdown",
     ) -> HTMLMetadata:
         """
         Capture and download the html content of the document or a specific element. The returned HTML
@@ -273,11 +275,15 @@ async def capture_html(
         :param selector: CSS selector of element to capture. Defaults to "html" for the document element.
         :param exclude_selectors: List of CSS selectors for elements to exclude from capture.
         :param soup_transform: A function to transform the BeautifulSoup html prior to saving.
+        :param html_converter_type: Type of HTML converter to use for the inner text. Defaults to "markdown".
         :return: HTMLMetadata containing download URL, HTML content and inner text.
         :raises ValueError: If the specified selector doesn't match any element.
         """
         html, text = await self._get_html(
-            selector, exclude_selectors or [], soup_transform or (lambda x: None)
+            selector,
+            exclude_selectors or [],
+            soup_transform or (lambda x: None),
+            html_converter_type,
         )
 
         downloads = await self._notify_observers(
@@ -300,6 +306,7 @@ async def _get_html(
         selector: str,
         exclude_selectors: List[str],
         soup_transform: Callable[[BeautifulSoup], None],
+        html_converter_type: HTMLConverterType,
     ) -> Tuple[str, str]:
         element = await self.page.query_selector(selector)
 
@@ -315,7 +322,7 @@ async def _get_html(
 
         soup_transform(soup)
 
-        text = soup.get_text(separator="\n", strip=True)
+        text = get_html_converter(html_converter_type).convert_soup(soup)
 
         return str(soup), text
 

diff --git a/sdk/harambe/html_converter/__init__.py b/sdk/harambe/html_converter/__init__.py
@@ -0,0 +1,20 @@
+from typing import Literal
+
+from harambe_core.errors import UnknownHTMLConverter
+from markdownify import MarkdownConverter
+
+from harambe.html_converter.html_to_markdown import HTMLToMarkdownConverter
+from harambe.html_converter.html_to_text import HTMLToTextConverter
+
+HTMLConverterType = Literal["markdown", "text"]
+
+
+def get_html_converter(
+    html_converter_type: HTMLConverterType | None,
+) -> MarkdownConverter:
+    """Instantiate the converter for a type; raise UnknownHTMLConverter if unknown."""
+    if html_converter_type == "markdown":
+        return HTMLToMarkdownConverter()
+    if html_converter_type == "text":
+        return HTMLToTextConverter()
+    raise UnknownHTMLConverter(html_converter_type)
diff --git a/sdk/harambe/html_converter/html_to_markdown.py b/sdk/harambe/html_converter/html_to_markdown.py
@@ -0,0 +1,5 @@
+from markdownify import MarkdownConverter
+
+
+class HTMLToMarkdownConverter(MarkdownConverter):
+    """Markdown converter that uses markdownify's behaviour unchanged."""
diff --git a/sdk/harambe/html_converter/html_to_text.py b/sdk/harambe/html_converter/html_to_text.py
@@ -0,0 +1,52 @@
+from bs4.element import Tag
+from markdownify import MarkdownConverter
+
+
+class HTMLToTextConverter(MarkdownConverter):
+    """
+    Custom converter to convert data from HTML to text
+
+    Strip out markdown syntax for headings, b/strong, i/em and a
+    Include footnotes (sup/sub) in brackets
+    Tags without an override here keep the inherited conversion
+    """
+
+    def convert_sup(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        """Wrap ``<sup>`` content in square brackets."""
+        return f"[{text}]"
+
+    def convert_sub(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        """Wrap ``<sub>`` content in square brackets."""
+        return f"[{text}]"
+
+    def convert_span(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        """Bracket text of spans classed ``sup`` or ``sub``; else return it as is."""
+        classes = el.get("class") or ()
+        if "sup" in classes or "sub" in classes:
+            return f"[{text}]"
+        return text
+
+    def _as_paragraph(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        """Render a heading the same way as a paragraph, via ``convert_p``."""
+        return self.convert_p(el, text, convert_as_inline)
+
+    convert_h1 = convert_h2 = convert_h3 = _as_paragraph
+    convert_h4 = convert_h5 = convert_h6 = _as_paragraph
+
+    # Treat inline elements as spans. <b> and <i> need their own overrides:
+    # markdownify dispatches on the tag name, so overriding strong/em alone
+    # would still leave ** and * markers around <b> and <i> content.
+    def convert_strong(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        return self.convert_span(el, text, convert_as_inline)
+
+    def convert_b(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        return self.convert_span(el, text, convert_as_inline)
+
+    def convert_em(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        return self.convert_span(el, text, convert_as_inline)
+
+    def convert_i(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        return self.convert_span(el, text, convert_as_inline)
+
+    def convert_a(self, el: Tag, text: str, convert_as_inline: bool) -> str:
+        return self.convert_span(el, text, convert_as_inline)
diff --git a/sdk/test/mock_html/heading.html b/sdk/test/mock_html/heading.html
@@ -0,0 +1,9 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+</head>
+<body>
+<h3>Heading</h3>
+</body>
+</html>
diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py
@@ -497,6 +497,34 @@ async def scraper(sdk: SDK, *args, **kwargs):
     assert "Replaced Text" in replaced_head_data["text"]
 
 
+@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
+async def test_capture_html_conversion_types(server, observer, harness):
+    """Capture the heading page with the default and the "text" converter."""
+
+    @SDK.scraper("test", "detail", observer=observer)
+    async def scraper(sdk: SDK, *args, **kwargs):
+        # First capture relies on the default converter type, second asks for text
+        for capture_options in ({}, {"html_converter_type": "text"}):
+            captured = await sdk.capture_html(**capture_options)
+            await sdk.save_data({"text": captured["text"]})
+
+    await SDK.run(
+        scraper=scraper,
+        url=f"{server}/heading",
+        schema={},
+        headless=True,
+        harness=harness,
+    )
+
+    results = observer.data
+    assert len(results) == 2
+    # Markdown syntax is used
+    assert results[0]["text"].strip() == "### Heading"
+
+    # Text doesn't include markdown syntax
+    assert results[1]["text"].strip() == "Heading"
+
+
 @pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
 async def test_capture_html_element_not_found(server, observer, harness):
     url = f"{server}/table"