feat: group links (#16)
* group links, code improvements, v0.0 -> v0.1

* improve code, add clean_llms_data tests

* use h1 as section title, <title> then url path as fallback

* crawler keep all html, renderer strip
MQ37 authored Jan 27, 2025
1 parent 4d0d786 commit c0d4d9e
Showing 9 changed files with 396 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .actor/actor.json
@@ -3,7 +3,7 @@
"name": "llmstxt-generator",
"title": "Generate /llms.txt for the given site",
"description": "Generates /llms.txt for the given site",
"version": "0.0",
"version": "0.1",
"buildTag": "latest",
"input": "./input_schema.json",
"storages": {
2 changes: 1 addition & 1 deletion Makefile
@@ -1,6 +1,6 @@
.PHONY: clean install-dev lint type-check unit-test format

DIRS_WITH_CODE = src
DIRS_WITH_CODE = src/ tests/

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
2 changes: 1 addition & 1 deletion src/crawler_config.py
@@ -1,6 +1,6 @@
CRAWLER_CONFIG = {
'htmlTransformer': 'none',
'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle',
# dummy value is used to prevent the removal of any elements
# changed by get_crawler_actor_config with default value 1
'maxCrawlDepth': 0, # 0 by default for root page only just in case
'maxCrawlPages': 10, # 10 by default, just in case it is not set
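The crawler is now configured to keep the full page HTML (`'htmlTransformer': 'none'`), leaving the stripping to the renderer, while `maxCrawlDepth` and `maxCrawlPages` act only as safety defaults that `get_crawler_actor_config` overrides per run. The real helper is only partially visible in this diff, so the sketch below is an assumption about how such an override might look:

```python
from copy import deepcopy

# Base config mirroring src/crawler_config.py
CRAWLER_CONFIG = {
    'htmlTransformer': 'none',   # keep all HTML; the renderer strips it later
    'maxCrawlDepth': 0,          # safety default, overridden per run
    'maxCrawlPages': 10,         # safety default, overridden per run
}


def get_crawler_actor_config(url: str, max_crawl_depth: int = 1, max_crawl_pages: int = 50) -> dict:
    """Hypothetical sketch: copy the base config and apply per-run overrides."""
    config = deepcopy(CRAWLER_CONFIG)
    config['startUrls'] = [{'url': url}]
    config['maxCrawlDepth'] = max_crawl_depth
    config['maxCrawlPages'] = max_crawl_pages
    return config
```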
69 changes: 66 additions & 3 deletions src/helpers.py
@@ -12,11 +12,74 @@
if TYPE_CHECKING:
from apify_client.clients import KeyValueStoreClientAsync

from src.mytypes import LLMSData

# not using Actor.log because pytest then throws a warning
# about non existent event loop
logger = logging.getLogger('apify')


def get_section_dir_title(section_dir: str, path_titles: dict[str, str]) -> str:
"""Gets the title of the section from the path titles."""
current_dir = section_dir
while (parent_title := path_titles.get(current_dir)) is None:
current_dir = current_dir.rsplit('/', 1)[0]
if not current_dir:
parent_title = section_dir
break
return parent_title


def get_h1_from_html(html: str) -> str | None:
"""Extracts the first h1 tag from the HTML content."""
soup = bs4.BeautifulSoup(html, 'html.parser')
h1 = soup.find('h1')
return h1.getText() if h1 else None


def clean_llms_data(data: LLMSData, section_min_links: int = 2) -> None:
"""Cleans the LLMS data by removing sections with low link count and moving the links to the index section.
:param data: LLMS data to clean
:param section_min_links: Minimum number of links in a section to keep it
and not move the links to the index section
"""
to_remove_sections: set[str] = set()

if 'sections' not in data:
raise ValueError('Missing "sections" attribute in the LLMS data!')

sections = data['sections']

for section_dir, section in sections.items():
# skip the index section
if section_dir == '/':
continue
if len(section['links']) < section_min_links:
to_remove_sections.add(section_dir)

if to_remove_sections:
if '/' not in sections:
sections['/'] = {'title': 'Index', 'links': []}
for section_dir in to_remove_sections:
sections['/']['links'].extend(sections[section_dir]['links'])
del sections[section_dir]


def get_url_path(url: str) -> str:
"""Get the path from the URL."""
url_normalized = normalize_url(url)
parsed_url = urlparse(url_normalized)
return parsed_url.path or '/'


def get_url_path_dir(url: str) -> str:
"""Get the directory path from the URL."""
url_normalized = normalize_url(url)
parsed_url = urlparse(url_normalized)
return parsed_url.path.rsplit('/', 1)[0] or '/'


def normalize_url(url: str) -> str:
"""Normalizes the URL by removing trailing slash."""
parsed_url = urlparse(url)
@@ -44,8 +107,8 @@ def is_description_suitable(description: str | None) -> bool:
return '\n' not in description


async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None:
"""Extracts the description from the HTML content stored in the KV store."""
async def get_html_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None:
"""Gets the HTML content from the KV store."""
store_id = html_url.split('records/')[-1]
if not (record := await kvstore.get_record(store_id)):
logger.warning(f'Failed to get record with id "{store_id}"!')
@@ -54,7 +117,7 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u
logger.warning(f'Invalid HTML content for record with id "{store_id}"!')
return None

return get_description_from_html(html)
return str(html)


def get_crawler_actor_config(
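Taken together, the new helpers derive a section key from each page URL, resolve a human-readable title for it, and fold undersized sections back into the index. A small usage sketch, assuming the definitions above (the sample URLs and titles are invented):

```python
from src.helpers import clean_llms_data, get_section_dir_title, get_url_path, get_url_path_dir

path_titles = {'/docs': 'Documentation'}

get_url_path('https://example.com/docs/api/')      # -> '/docs/api' (trailing slash normalized away)
get_url_path_dir('https://example.com/docs/api')   # -> '/docs'
get_section_dir_title('/docs/api', path_titles)    # no title for '/docs/api', walks up to '/docs' -> 'Documentation'

data = {
    'title': 'Example',
    'description': None,
    'details': None,
    'sections': {
        '/docs': {'title': 'Documentation',
                  'links': [{'url': 'https://example.com/docs/a', 'title': 'A', 'description': None}]},
    },
}
clean_llms_data(data, section_min_links=2)
# '/docs' has fewer than 2 links, so it is removed and its link is moved into a new '/' (Index) section.
```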
76 changes: 49 additions & 27 deletions src/main.py
@@ -8,16 +8,29 @@

from apify import Actor

from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable, normalize_url
from .helpers import (
clean_llms_data,
get_crawler_actor_config,
get_description_from_html,
get_h1_from_html,
get_html_from_kvstore,
get_section_dir_title,
get_url_path,
get_url_path_dir,
is_description_suitable,
normalize_url,
)
from .renderer import render_llms_txt

if TYPE_CHECKING:
from src.mytypes import LLMSData, SectionDict
from src.mytypes import LLMSData

logger = logging.getLogger('apify')

# minimum for the llms.txt generator to process the results
MIN_GENERATOR_RUN_SECS = 60
LOG_POLL_INTERVAL_SECS = 5
SECTION_MIN_LINKS = 2


async def main() -> None:
@@ -66,7 +79,7 @@ async def main() -> None:
),
# memory limit for the crawler actor so free tier can use this actor
memory_mbytes=4096,
wait=timedelta(seconds=5),
wait=timedelta(seconds=LOG_POLL_INTERVAL_SECS),
timeout=timeout_crawler,
)
if actor_run_details is None:
@@ -82,7 +95,7 @@ async def main() -> None:
if status_msg is not None:
await Actor.set_status_message(status_msg)
last_status_msg = status_msg
await asyncio.sleep(5)
await asyncio.sleep(LOG_POLL_INTERVAL_SECS)

if not (run := await run_client.wait_for_finish()):
msg = 'Failed to get the "apify/website-content-crawler" actor run details!'
@@ -97,41 +110,48 @@ async def main() -> None:
hostname = urlparse(url).hostname
root_title = hostname

data: LLMSData = {'title': root_title, 'description': None, 'details': None, 'sections': []}
# add all pages to index section for now
section: SectionDict = {'title': 'Index', 'links': []}
data: LLMSData = {'title': root_title, 'description': None, 'details': None, 'sections': {}}
sections = data['sections']

is_dataset_empty = True
path_titles: dict[str, str] = {}
sections_to_fill_title = []
async for item in run_dataset.iterate_items():
is_dataset_empty = False
item_url = item.get('url')
logger.info(f'Processing page: {item_url}')
if item_url is None:
if (item_url := item.get('url')) is None:
logger.warning('Missing "url" attribute in dataset item!')
continue
html_url = item.get('htmlUrl')
if html_url is None:
logger.info(f'Processing page: {item_url}')
if (html_url := item.get('htmlUrl')) is None:
logger.warning('Missing "htmlUrl" attribute in dataset item!')
continue

html = await get_html_from_kvstore(run_store, html_url)
metadata = item.get('metadata', {})
description = metadata.get('description') or (get_description_from_html(html) if html else None)
title = (get_h1_from_html(html) if html else None) or metadata.get('title')
path_titles[get_url_path(item_url)] = title

# handle input root url separately
is_root = normalize_url(item_url) == url_normalized
if is_root:
description = await get_description_from_kvstore(run_store, html_url)
data['description'] = description if is_description_suitable(description) else None
continue

metadata = item.get('metadata', {})
description = metadata.get('description')
title = metadata.get('title')

# extract description from HTML, crawler might not have extracted it
if description is None:
description = await get_description_from_kvstore(run_store, html_url)

if not is_description_suitable(description):
description = None

section['links'].append({'url': item_url, 'title': title, 'description': description})
section_dir = get_url_path_dir(item_url)
section_title = path_titles.get(section_dir)
if section_dir not in sections:
sections[section_dir] = {'title': section_title or section_dir, 'links': []}
if section_title is None:
sections_to_fill_title.append(section_dir)

sections[section_dir]['links'].append(
{
'url': item_url,
'title': title,
'description': description if is_description_suitable(description) else None,
}
)

if is_dataset_empty:
msg = (
@@ -140,9 +160,11 @@
)
raise RuntimeError(msg)

if section['links']:
data['sections'].append(section)
for section_dir in sections_to_fill_title:
sections[section_dir]['title'] = get_section_dir_title(section_dir, path_titles)

# move sections with less than SECTION_MIN_LINKS to the root
clean_llms_data(data)
output = render_llms_txt(data)

# save into kv-store as a file to be able to download it
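The main loop now buckets every crawled page into a section keyed by its URL directory, records page titles by path, and back-fills the section titles afterwards via `get_section_dir_title`. A condensed sketch of that grouping step, using invented dataset items and the helpers from above:

```python
from src.helpers import get_section_dir_title, get_url_path, get_url_path_dir

items = [  # stand-ins for dataset items from the website-content-crawler run
    {'url': 'https://example.com/docs/a', 'title': 'Page A'},
    {'url': 'https://example.com/docs/b', 'title': 'Page B'},
]

sections: dict[str, dict] = {}
path_titles: dict[str, str] = {}
sections_to_fill_title: list[str] = []

for item in items:
    path_titles[get_url_path(item['url'])] = item['title']
    section_dir = get_url_path_dir(item['url'])
    if section_dir not in sections:
        section_title = path_titles.get(section_dir)
        sections[section_dir] = {'title': section_title or section_dir, 'links': []}
        if section_title is None:
            sections_to_fill_title.append(section_dir)
    sections[section_dir]['links'].append({'url': item['url'], 'title': item['title'], 'description': None})

# once all pages are seen, resolve the titles that were still unknown
for section_dir in sections_to_fill_title:
    sections[section_dir]['title'] = get_section_dir_title(section_dir, path_titles)
```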
2 changes: 1 addition & 1 deletion src/mytypes.py
@@ -24,4 +24,4 @@ class LLMSData(TypedDict):
title: str
description: str | None
details: str | None
sections: list[SectionDict]
sections: dict[str, SectionDict]
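With `sections` now a dict keyed by URL directory, the surrounding TypedDicts fit together roughly as below; `SectionDict` and the link shape are not shown in this hunk, so their exact fields are inferred from how main.py and renderer.py use them:

```python
from typing import TypedDict


class LinkDict(TypedDict):
    url: str
    title: str
    description: str | None


class SectionDict(TypedDict):
    title: str
    links: list[LinkDict]


class LLMSData(TypedDict):
    title: str
    description: str | None
    details: str | None
    sections: dict[str, SectionDict]  # keyed by URL path directory, e.g. '/docs'
```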
22 changes: 12 additions & 10 deletions src/renderer.py
@@ -30,20 +30,22 @@ def render_llms_txt(data: LLMSData) -> str:
- [Example](https://example.com): Example description
"""
result = [f"# {data['title']}\n\n"]
result = [f"# {data['title'].strip()}\n\n"]

if data.get('description'):
result.append(f"> {data['description']}\n\n")
if description := data.get('description'):
result.append(f'> {description.strip()}\n\n')

if data.get('details'):
result.append(f"{data['details']}\n\n")
if details := data.get('details'):
result.append(f'{details.strip()}\n\n')

for section in data.get('sections', []):
result.append(f"## {section['title']}\n\n")
for section_dir in sorted(data.get('sections', {})):
section = data['sections'][section_dir]
result.append(f"## {section['title'].strip()}\n\n")
for link in section.get('links', []):
link_str = f"- [{link['title']}]({link['url']})"
if link.get('description'):
link_str += f": {link['description']}"
link_str = f"- [{link['title'].strip()}]({link['url'].strip()})"
if link_description := link.get('description'):
link_str += f': {link_description.strip()}'
result.append(f'{link_str}\n')
result.append('\n')

return ''.join(result)
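For grouped data the renderer now emits one `##` heading per section directory, sorted by key. A quick example of the output, with invented input data:

```python
from src.renderer import render_llms_txt

data = {
    'title': 'Example',
    'description': 'Example site',
    'details': None,
    'sections': {
        '/docs': {
            'title': 'Documentation',
            'links': [{'url': 'https://example.com/docs/a', 'title': 'Page A', 'description': 'About A'}],
        },
    },
}

print(render_llms_txt(data))
# # Example
#
# > Example site
#
# ## Documentation
#
# - [Page A](https://example.com/docs/a): About A
```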