feat: group links (#16)
* group links, code improvements, v0.0 -> v0.1

* improve code, add clean_llms_data tests

* use h1 as section title, <title> then url path as fallback

* crawler keep all html, renderer strip
MQ37 authored Jan 27, 2025
1 parent 4d0d786 commit c0d4d9e
Showing 9 changed files with 396 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .actor/actor.json
@@ -3,7 +3,7 @@
"name": "llmstxt-generator",
"title": "Generate /llms.txt for the given site",
"description": "Generates /llms.txt for the given site",
"version": "0.0",
"version": "0.1",
"buildTag": "latest",
"input": "./input_schema.json",
"storages": {
2 changes: 1 addition & 1 deletion Makefile
@@ -1,6 +1,6 @@
.PHONY: clean install-dev lint type-check unit-test format

DIRS_WITH_CODE = src
DIRS_WITH_CODE = src/ tests/

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
2 changes: 1 addition & 1 deletion src/crawler_config.py
@@ -1,6 +1,6 @@
CRAWLER_CONFIG = {
'htmlTransformer': 'none',
'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle',
# dummy value is used to prevent the removal of any elements
# changed by get_crawler_actor_config with default value 1
'maxCrawlDepth': 0, # 0 by default for root page only just in case
'maxCrawlPages': 10, # 10 by default, just in case it is not set
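The crawler is now configured to keep the full page HTML (`'htmlTransformer': 'none'`), leaving the stripping to the renderer, while `maxCrawlDepth` and `maxCrawlPages` act only as safety defaults that `get_crawler_actor_config` overrides per run. The real helper is only partially visible in this diff, so the sketch below is an assumption about how such an override might look:

```python
from copy import deepcopy

# Base config mirroring src/crawler_config.py
CRAWLER_CONFIG = {
    'htmlTransformer': 'none',   # keep all HTML; the renderer strips it later
    'maxCrawlDepth': 0,          # safety default, overridden per run
    'maxCrawlPages': 10,         # safety default, overridden per run
}


def get_crawler_actor_config(url: str, max_crawl_depth: int = 1, max_crawl_pages: int = 50) -> dict:
    """Hypothetical sketch: copy the base config and apply per-run overrides."""
    config = deepcopy(CRAWLER_CONFIG)
    config['startUrls'] = [{'url': url}]
    config['maxCrawlDepth'] = max_crawl_depth
    config['maxCrawlPages'] = max_crawl_pages
    return config
```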
69 changes: 66 additions & 3 deletions src/helpers.py
@@ -12,11 +12,74 @@
if TYPE_CHECKING:
from apify_client.clients import KeyValueStoreClientAsync

from src.mytypes import LLMSData

# not using Actor.log because pytest then throws a warning
# about non existent event loop
logger = logging.getLogger('apify')


def get_section_dir_title(section_dir: str, path_titles: dict[str, str]) -> str:
"""Gets the title of the section from the path titles."""
current_dir = section_dir
while (parent_title := path_titles.get(current_dir)) is None:
current_dir = current_dir.rsplit('/', 1)[0]
if not current_dir:
parent_title = section_dir
break
return parent_title


def get_h1_from_html(html: str) -> str | None:
"""Extracts the first h1 tag from the HTML content."""
soup = bs4.BeautifulSoup(html, 'html.parser')
h1 = soup.find('h1')
return h1.getText() if h1 else None


def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> None:
"""Cleans the LLMS data by removing sections with low link count and moving the links to the index section.
:param data: LLMS data to clean
:param section_min_links: Minimum number of links in a section to keep it
and not move the links to the index section
"""
to_remove_sections: set[str] = set()

if 'sections' not in data:
raise ValueError('Missing "sections" attribute in the LLMS data!')

sections = data['sections']

for section_dir, section in sections.items():
# skip the index section
if section_dir == '/':
continue
if len(section['links']) < section_min_links:
to_remove_sections.add(section_dir)

if to_remove_sections:
if '/' not in sections:
sections['/'] = {'title': 'Index', 'links': []}
for section_dir in to_remove_sections:
sections['/']['links'].extend(sections[section_dir]['links'])
del sections[section_dir]


def get_url_path(url: str) -> str:
"""Get the path from the URL."""
url_normalized = normalize_url(url)
parsed_url = urlparse(url_normalized)
return parsed_url.path or '/'


def get_url_path_dir(url: str) -> str:
"""Get the directory path from the URL."""
url_normalized = normalize_url(url)
parsed_url = urlparse(url_normalized)
return parsed_url.path.rsplit('/', 1)[0] or '/'


def normalize_url(url: str) -> str:
"""Normalizes the URL by removing trailing slash."""
parsed_url = urlparse(url)
@@ -44,8 +107,8 @@ def is_description_suitable(description: str | None) -> bool:
return '\n' not in description


async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None:
"""Extracts the description from the HTML content stored in the KV store."""
async def get_html_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None:
"""Gets the HTML content from the KV store."""
store_id = html_url.split('records/')[-1]
if not (record := await kvstore.get_record(store_id)):
logger.warning(f'Failed to get record with id "{store_id}"!')
@@ -54,7 +117,7 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u
logger.warning(f'Invalid HTML content for record with id "{store_id}"!')
return None

return get_description_from_html(html)
return str(html)


def get_crawler_actor_config(
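Taken together, the new helpers derive a section key from each page URL, resolve a human-readable title for it, and fold undersized sections back into the index. A small usage sketch, assuming the definitions above (the sample URLs and titles are invented):

```python
from src.helpers import clean_llms_data, get_section_dir_title, get_url_path, get_url_path_dir

path_titles = {'/docs': 'Documentation'}

get_url_path('https://example.com/docs/api/')      # -> '/docs/api' (trailing slash normalized away)
get_url_path_dir('https://example.com/docs/api')   # -> '/docs'
get_section_dir_title('/docs/api', path_titles)    # no title for '/docs/api', walks up to '/docs' -> 'Documentation'

data = {
    'title': 'Example',
    'description': None,
    'details': None,
    'sections': {
        '/docs': {'title': 'Documentation',
                  'links': [{'url': 'https://example.com/docs/a', 'title': 'A', 'description': None}]},
    },
}
clean_llms_data(data, section_min_links=2)
# '/docs' has fewer than 2 links, so it is removed and its link is moved into a new '/' (Index) section.
```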
76 changes: 49 additions & 27 deletions src/main.py
@@ -8,16 +8,29 @@

from apify import Actor

from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable, normalize_url
from .helpers import (
clean_llms_data,
get_crawler_actor_config,
get_description_from_html,
get_h1_from_html,
get_html_from_kvstore,
get_section_dir_title,
get_url_path,
get_url_path_dir,
is_description_suitable,
normalize_url,
)
from .renderer import render_llms_txt

if TYPE_CHECKING:
from src.mytypes import LLMSData, SectionDict
from src.mytypes import LLMSData

logger = logging.getLogger('apify')

# minimum for the llms.txt generator to process the results
MIN_GENERATOR_RUN_SECS = 60
LOG_POLL_INTERVAL_SECS = 5
SECTION_MIN_LINKS = 2


async def main() -> None:
@@ -66,7 +79,7 @@ async def main() -> None:
),
# memory limit for the crawler actor so free tier can use this actor
memory_mbytes=4096,
wait=timedelta(seconds=5),
wait=timedelta(seconds=LOG_POLL_INTERVAL_SECS),
timeout=timeout_crawler,
)
if actor_run_details is None:
@@ -82,7 +95,7 @@ async def main() -> None:
if status_msg is not None:
await Actor.set_status_message(status_msg)
last_status_msg = status_msg
await asyncio.sleep(5)
await asyncio.sleep(LOG_POLL_INTERVAL_SECS)

if not (run := await run_client.wait_for_finish()):
msg = 'Failed to get the "apify/website-content-crawler" actor run details!'
@@ -97,41 +110,48 @@ async def main() -> None:
hostname = urlparse(url).hostname
root_title = hostname

data: LLMSData = {'title': root_title, 'description': None, 'details': None, 'sections': []}
# add all pages to index section for now
section: SectionDict = {'title': 'Index', 'links': []}
data: LLMSData = {'title': root_title, 'description': None, 'details': None, 'sections': {}}
sections = data['sections']

is_dataset_empty = True
path_titles: dict[str, str] = {}
sections_to_fill_title = []
async for item in run_dataset.iterate_items():
is_dataset_empty = False
item_url = item.get('url')
logger.info(f'Processing page: {item_url}')
if item_url is None:
if (item_url := item.get('url')) is None:
logger.warning('Missing "url" attribute in dataset item!')
continue
html_url = item.get('htmlUrl')
if html_url is None:
logger.info(f'Processing page: {item_url}')
if (html_url := item.get('htmlUrl')) is None:
logger.warning('Missing "htmlUrl" attribute in dataset item!')
continue

html = await get_html_from_kvstore(run_store, html_url)
metadata = item.get('metadata', {})
description = metadata.get('description') or (get_description_from_html(html) if html else None)
title = (get_h1_from_html(html) if html else None) or metadata.get('title')
path_titles[get_url_path(item_url)] = title

# handle input root url separately
is_root = normalize_url(item_url) == url_normalized
if is_root:
description = await get_description_from_kvstore(run_store, html_url)
data['description'] = description if is_description_suitable(description) else None
continue

metadata = item.get('metadata', {})
description = metadata.get('description')
title = metadata.get('title')

# extract description from HTML, crawler might not have extracted it
if description is None:
description = await get_description_from_kvstore(run_store, html_url)

if not is_description_suitable(description):
description = None

section['links'].append({'url': item_url, 'title': title, 'description': description})
section_dir = get_url_path_dir(item_url)
section_title = path_titles.get(section_dir)
if section_dir not in sections:
sections[section_dir] = {'title': section_title or section_dir, 'links': []}
if section_title is None:
sections_to_fill_title.append(section_dir)

sections[section_dir]['links'].append(
{
'url': item_url,
'title': title,
'description': description if is_description_suitable(description) else None,
}
)

if is_dataset_empty:
msg = (
@@ -140,9 +160,11 @@
)
raise RuntimeError(msg)

if section['links']:
data['sections'].append(section)
for section_dir in sections_to_fill_title:
sections[section_dir]['title'] = get_section_dir_title(section_dir, path_titles)

# move sections with less than SECTION_MIN_LINKS to the root
clean_llms_data(data)
output = render_llms_txt(data)

# save into kv-store as a file to be able to download it
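The main loop now buckets every crawled page into a section keyed by its URL directory, records page titles by path, and back-fills the section titles afterwards via `get_section_dir_title`. A condensed sketch of that grouping step, using invented dataset items and the helpers from above:

```python
from src.helpers import get_section_dir_title, get_url_path, get_url_path_dir

items = [  # stand-ins for dataset items from the website-content-crawler run
    {'url': 'https://example.com/docs/a', 'title': 'Page A'},
    {'url': 'https://example.com/docs/b', 'title': 'Page B'},
]

sections: dict[str, dict] = {}
path_titles: dict[str, str] = {}
sections_to_fill_title: list[str] = []

for item in items:
    path_titles[get_url_path(item['url'])] = item['title']
    section_dir = get_url_path_dir(item['url'])
    if section_dir not in sections:
        section_title = path_titles.get(section_dir)
        sections[section_dir] = {'title': section_title or section_dir, 'links': []}
        if section_title is None:
            sections_to_fill_title.append(section_dir)
    sections[section_dir]['links'].append({'url': item['url'], 'title': item['title'], 'description': None})

# once all pages are seen, resolve the titles that were still unknown
for section_dir in sections_to_fill_title:
    sections[section_dir]['title'] = get_section_dir_title(section_dir, path_titles)
```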
2 changes: 1 addition & 1 deletion src/mytypes.py
@@ -24,4 +24,4 @@ class LLMSData(TypedDict):
title: str
description: str | None
details: str | None
sections: list[SectionDict]
sections: dict[str, SectionDict]
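With `sections` now a dict keyed by URL directory, the surrounding TypedDicts fit together roughly as below; `SectionDict` and the link shape are not shown in this hunk, so their exact fields are inferred from how main.py and renderer.py use them:

```python
from typing import TypedDict


class LinkDict(TypedDict):
    url: str
    title: str
    description: str | None


class SectionDict(TypedDict):
    title: str
    links: list[LinkDict]


class LLMSData(TypedDict):
    title: str
    description: str | None
    details: str | None
    sections: dict[str, SectionDict]  # keyed by URL path directory, e.g. '/docs'
```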
22 changes: 12 additions & 10 deletions src/renderer.py
@@ -30,20 +30,22 @@ def render_llms_txt(data: LLMSData) -> str:
- [Example](https://example.com): Example description
"""
result = [f"# {data['title']}\n\n"]
result = [f"# {data['title'].strip()}\n\n"]

if data.get('description'):
result.append(f"> {data['description']}\n\n")
if description := data.get('description'):
result.append(f'> {description.strip()}\n\n')

if data.get('details'):
result.append(f"{data['details']}\n\n")
if details := data.get('details'):
result.append(f'{details.strip()}\n\n')

for section in data.get('sections', []):
result.append(f"## {section['title']}\n\n")
for section_dir in sorted(data.get('sections', {})):
section = data['sections'][section_dir]
result.append(f"## {section['title'].strip()}\n\n")
for link in section.get('links', []):
link_str = f"- [{link['title']}]({link['url']})"
if link.get('description'):
link_str += f": {link['description']}"
link_str = f"- [{link['title'].strip()}]({link['url'].strip()})"
if link_description := link.get('description'):
link_str += f': {link_description.strip()}'
result.append(f'{link_str}\n')
result.append('\n')

return ''.join(result)
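For grouped data the renderer now emits one `##` heading per section directory, sorted by key. A quick example of the output, with invented input data:

```python
from src.renderer import render_llms_txt

data = {
    'title': 'Example',
    'description': 'Example site',
    'details': None,
    'sections': {
        '/docs': {
            'title': 'Documentation',
            'links': [{'url': 'https://example.com/docs/a', 'title': 'Page A', 'description': 'About A'}],
        },
    },
}

print(render_llms_txt(data))
# # Example
#
# > Example site
#
# ## Documentation
#
# - [Page A](https://example.com/docs/a): About A
```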