Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✏️ Better HTML conversion options #98

Merged
merged 10 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions core/harambe_core/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ class HarambeException(Exception):
pass


class UnknownHTMLConverter(HarambeException):
def __init__(self, converter_type: any) -> None:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `converter_type` parameter should be annotated with `Any` (from the `typing` module) instead of `any`; lowercase `any` is the builtin function, not a type.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ellipsis is right, it should be Any from the typing module

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call

super().__init__(
f"Unknown HTML converter type: {converter_type}"
)


class GotoError(HarambeException):
def __init__(self, url: str, status: int) -> None:
super().__init__(
Expand Down
11 changes: 9 additions & 2 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ResourceRequestHandler,
ResourceType,
)
from harambe.html_converter import HTMLConverterType, get_html_converter
from harambe.observer import (
DownloadMeta,
HTMLMetadata,
Expand Down Expand Up @@ -265,6 +266,7 @@ async def capture_html(
exclude_selectors: List[str] | None = None,
*,
soup_transform: Optional[Callable[[BeautifulSoup], None]] = None,
html_converter_type: HTMLConverterType = "markdown",
) -> HTMLMetadata:
"""
Capture and download the html content of the document or a specific element. The returned HTML
Expand All @@ -273,11 +275,15 @@ async def capture_html(
:param selector: CSS selector of element to capture. Defaults to "html" for the document element.
:param exclude_selectors: List of CSS selectors for elements to exclude from capture.
:param soup_transform: A function to transform the BeautifulSoup html prior to saving.
:param html_converter_type: Type of HTML converter to use for the inner text. Defaults to "markdown".
:return: HTMLMetadata containing download URL, HTML content and inner text.
:raises ValueError: If the specified selector doesn't match any element.
"""
html, text = await self._get_html(
selector, exclude_selectors or [], soup_transform or (lambda x: None)
selector,
exclude_selectors or [],
soup_transform or (lambda x: None),
html_converter_type,
)

downloads = await self._notify_observers(
Expand All @@ -300,6 +306,7 @@ async def _get_html(
selector: str,
exclude_selectors: List[str],
soup_transform: Callable[[BeautifulSoup], None],
html_converter_type: HTMLConverterType,
) -> Tuple[str, str]:
element = await self.page.query_selector(selector)

Expand All @@ -315,7 +322,7 @@ async def _get_html(

soup_transform(soup)

text = soup.get_text(separator="\n", strip=True)
text = get_html_converter(html_converter_type).convert_soup(soup)

return str(soup), text

Expand Down
20 changes: 20 additions & 0 deletions sdk/harambe/html_converter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import Literal

from harambe_core.errors import UnknownHTMLConverter
from markdownify import MarkdownConverter

from sdk.harambe.html_converter.html_to_markdown import HTMLToMarkdownConverter
from sdk.harambe.html_converter.html_to_text import HTMLToTextConverter

HTMLConverterType = Literal["markdown", "text"]


def get_html_converter(
html_converter_type: HTMLConverterType | None,
) -> MarkdownConverter:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type of get_html_converter should be Union[HTMLToMarkdownConverter, HTMLToTextConverter] instead of MarkdownConverter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no

if html_converter_type == "markdown":
return HTMLToMarkdownConverter()
if html_converter_type == "text":
return HTMLToTextConverter()
else:
raise UnknownHTMLConverter(html_converter_type)
5 changes: 5 additions & 0 deletions sdk/harambe/html_converter/html_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from markdownify import MarkdownConverter


class HTMLToMarkdownConverter(MarkdownConverter):
    """
    Converter used for the "markdown" HTML conversion type.

    Adds nothing on top of markdownify's MarkdownConverter; all behaviour
    is inherited unchanged.
    """
52 changes: 52 additions & 0 deletions sdk/harambe/html_converter/html_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from bs4.element import Tag
from markdownify import MarkdownConverter


class HTMLToTextConverter(MarkdownConverter):
    """
    Converter that turns HTML into text rather than markdown.

    Standard markdown syntax for headings, em, strong and a is stripped:
    headings are handed to ``convert_p`` and the inline elements are handed
    to ``convert_span``. Superscript / subscript content (e.g. footnote
    markers) is kept, wrapped in square brackets.
    """

    @staticmethod
    def _bracketed(text: str) -> str:
        """Wrap already-converted text in square brackets."""
        return f"[{text}]"

    def convert_sup(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<sup>`` content in brackets."""
        return self._bracketed(text)

    def convert_sub(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<sub>`` content in brackets."""
        return self._bracketed(text)

    def convert_span(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """
        Return the element's text, bracketed when its ``class`` attribute
        contains "sup" or "sub".
        """
        css_classes = el.get("class")
        if css_classes and ("sup" in css_classes or "sub" in css_classes):
            return self._bracketed(text)
        return text

    # Headings h1-h6 are all rendered through convert_p
    def convert_h1(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h1>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    def convert_h2(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h2>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    def convert_h3(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h3>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    def convert_h4(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h4>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    def convert_h5(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h5>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    def convert_h6(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<h6>`` via ``convert_p``."""
        return self.convert_p(el, text, convert_as_inline)

    # Treat inline elements as spans
    def convert_strong(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<strong>`` via ``convert_span``."""
        return self.convert_span(el, text, convert_as_inline)

    def convert_em(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<em>`` via ``convert_span``."""
        return self.convert_span(el, text, convert_as_inline)

    def convert_a(self, el: Tag, text: str, convert_as_inline: bool) -> str:
        """Render ``<a>`` via ``convert_span``."""
        return self.convert_span(el, text, convert_as_inline)
9 changes: 9 additions & 0 deletions sdk/test/mock_html/heading.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
<h3>Heading</h3>
</body>
</html>
28 changes: 28 additions & 0 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,34 @@ async def scraper(sdk: SDK, *args, **kwargs):
assert "Replaced Text" in replaced_head_data["text"]


@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
async def test_capture_html_conversion_types(server, observer, harness):
    """
    Capture the heading fixture page twice, once with the default converter
    and once with ``html_converter_type="text"``, and compare the saved text.
    """
    heading_url = f"{server}/heading"

    @SDK.scraper("test", "detail", observer=observer)
    async def scraper(sdk: SDK, *args, **kwargs):
        # First capture relies on the default converter type.
        default_capture = await sdk.capture_html()
        await sdk.save_data({"text": default_capture["text"]})

        # Second capture explicitly asks for the text converter.
        plain_capture = await sdk.capture_html(html_converter_type="text")
        await sdk.save_data({"text": plain_capture["text"]})

    await SDK.run(
        scraper=scraper,
        url=heading_url,
        schema={},
        headless=True,
        harness=harness,
    )

    saved_rows = observer.data
    assert len(saved_rows) == 2

    # Markdown syntax is used
    default_text = saved_rows[0]["text"].strip()
    assert default_text == "### Heading"

    # Text doesn't include markdown syntax
    plain_text = saved_rows[1]["text"].strip()
    assert plain_text == "Heading"


@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
async def test_capture_html_element_not_found(server, observer, harness):
url = f"{server}/table"
Expand Down
Loading