diff --git a/config/lethain.yaml b/config/lethain.yaml
index 2827406..51a0267 100644
--- a/config/lethain.yaml
+++ b/config/lethain.yaml
@@ -1,5 +1,5 @@
 vectara:
-  corpus_id: 160
+  corpus_id: 240
   customer_id: 1526022105
   reindex: false
diff --git a/core/crawler.py b/core/crawler.py
index 076a348..88b8110 100644
--- a/core/crawler.py
+++ b/core/crawler.py
@@ -1,5 +1,4 @@
 from omegaconf import OmegaConf, DictConfig
-from slugify import slugify
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -8,6 +7,7 @@
 from core.indexer import Indexer
 from core.pdf_convert import PDFConverter
 from core.utils import binary_extensions, doc_extensions
+from slugify import slugify
 
 get_headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
diff --git a/core/indexer.py b/core/indexer.py
index 683c3aa..9d895e9 100644
--- a/core/indexer.py
+++ b/core/indexer.py
@@ -3,9 +3,8 @@
 import os
 from typing import Tuple, Dict, Any, List, Optional
-from slugify import slugify
 import time
-from core.utils import create_session_with_retries
+from slugify import slugify
 
 from bs4 import BeautifulSoup
@@ -14,7 +13,8 @@
 import nbformat
 import markdown
 import docutils.core
-from core.utils import html_to_text, detect_language, get_file_size_in_MB
+
+from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries
 from core.extract import get_content_and_title
 from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
diff --git a/core/utils.py b/core/utils.py
index b51c24b..719666d 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -76,3 +76,4 @@ def get_file_size_in_MB(file_path: str) -> float:
     file_size_bytes = os.path.getsize(file_path)
     file_size_MB = file_size_bytes / (1024 * 1024)
     return file_size_MB
+
diff --git a/crawlers/github_crawler.py b/crawlers/github_crawler.py
index 5727263..f197c86 100644
--- a/crawlers/github_crawler.py
+++ b/crawlers/github_crawler.py
@@ -5,12 +5,23 @@
 from attrdict import AttrDict
 import logging
 import base64
+from datetime import datetime
+import markdown
 from ratelimiter import RateLimiter
-from core.utils import create_session_with_retries
+from core.utils import create_session_with_retries, html_to_text
 from typing import List, Any
 
+def convert_date(date_str: str) -> str:
+    # Remove the 'Z' at the end and parse the date string to a datetime object
+    date_obj = datetime.fromisoformat(date_str.replace("Z", ""))
+
+    # Format the datetime object to a string in the format YYYY-MM-DD
+    normal_date = date_obj.strftime("%Y-%m-%d")
+
+    return normal_date
+
 class Github(object):
     def __init__(self, repo: str, owner: str, token: str) -> None:
         self.repo = repo
@@ -30,7 +41,7 @@ def get_issues(self, state: str) -> List[Any]:
             logging.info(f"Error retrieving issues: {response.status_code}, {response.text}")
             return []
 
-    def get_comments(self, issue_number: str) -> List[Any]:
+    def get_issue_comments(self, issue_number: str) -> List[Any]:
         api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/issues/{issue_number}/comments"
         headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
         response = self.session.get(api_url, headers=headers)
@@ -39,7 +50,27 @@ def get_comments(self, issue_number: str) -> List[Any]:
         else:
             logging.info(f"Error retrieving comments: {response.status_code}, {response.text}")
             return []
+
+    def get_pull_requests(self, state: str) -> List[Any]:
+        # state can be "open", "closed", "all", or "merged"
+ api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls?state={state}" + headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"} + response = self.session.get(api_url, headers=headers) + if response.status_code == 200: + return list(response.json()) + else: + logging.info(f"Error retrieving pull requests: {response.status_code}, {response.text}") + return [] + def get_pr_comments(self, pull_number: int) -> List[Any]: + api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls/{pull_number}/comments" + headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"} + response = self.session.get(api_url, headers=headers) + if response.status_code == 200: + return list(response.json()) + else: + logging.info(f"Error retrieving comments for pull request #{pull_number}: {response.status_code}, {response.text}") + return [] class GithubCrawler(Crawler): @@ -77,6 +108,7 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None: logging.info(f"Failed to retrieve content for {fname} with url {url}: {e}") continue + text_content = html_to_text(markdown.markdown(file_content)) metadata = {'file': fname, 'source': 'github', 'url': url} code_doc = { 'documentId': f'github-{item["path"]}', @@ -85,33 +117,87 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None: 'metadataJson': json.dumps(metadata), 'section': [{ 'title': 'markdown', - 'text': file_content, + 'text': text_content, }] } + logging.info(f"Indexing codebase markdown: {item['path']}") self.indexer.index_document(code_doc) elif item["type"] == "dir": self.crawl_code_folder(base_url, path=item["path"]) + def add_comments(self, doc: dict, comments: List[Any]) -> None: + for d_comment in comments: + comment = AttrDict(d_comment) + metadata = { + 'id': comment.id, 'url': comment.html_url, 'source': 'github', + 'author': comment.user.login, 'created_at': convert_date(comment.created_at), 'updated_at': convert_date(comment.updated_at) + } + doc['section'].append({ + 'title': f'comment by {comment.user.login}', + 'text': comment.body, + 'metadataJson': json.dumps(metadata), + }) + def crawl_repo(self, repo: str, owner: str, token: str) -> None: + # create github object g = Github(repo, owner, token) - issues = g.get_issues("all") + # Extract and index pull requests + prs = g.get_pull_requests("all") + for d_pr in prs: + pr = AttrDict(d_pr) + doc_metadata = { + 'source': 'github', + 'id': pr.id, + 'number': pr.number, + 'url': pr.html_url, + 'title': pr.title, + 'state': pr.state, + 'author': pr.user.login, + 'created_at': convert_date(pr.created_at), + 'updated_at': convert_date(pr.updated_at) + } + pr_doc = { + 'documentId': f'github-{repo}-pr-{pr.number}', + 'title': pr.title, + 'metadataJson': json.dumps(doc_metadata), + 'section': [{ + 'title': pr.title, + 'text': pr.body, + }] + } + + comments = g.get_pr_comments(pr.number) + if len(comments)>0: + logging.info(f"Adding {len(comments)} comments for repo {repo}, PR {pr.number}") + self.add_comments(pr_doc, comments) + else: + logging.info(f"No comments for repo {repo}, PR {pr.number}") + + # index everything + try: + self.indexer.index_document(pr_doc) + except Exception as e: + logging.info(f"Error {e} indexing comment for repo {repo} document {pr_doc}") + continue + + # Extract and index issues and comments + issues = g.get_issues("all") for d_issue in issues: # Extract issue metadata issue = AttrDict(d_issue) - issue_id = f'github-issue-{issue.id}' title = issue.title 
             description = issue.body
-            created_at = str(issue.created_at)
-            updated_at = str(issue.updated_at)
+            created_at = convert_date(issue.created_at)
+            updated_at = convert_date(issue.updated_at)
             labels = [label.name for label in issue.labels]
             author = issue.user.login
             metadata = {'issue_number': issue.number, 'labels': labels, 'source': 'github', 'url': issue.html_url, 'state': issue.state}
             issue_doc = {
-                'documentId': f'github-issue-{issue_id}',
+                'documentId': f'github-{repo}-issue-{issue.number}',
                 'title': title,
                 'description': description,
                 'metadataJson': json.dumps(metadata),
@@ -125,50 +211,29 @@ def crawl_repo(self, repo: str, owner: str, token: str) -> None:
                 'section': [{
                     'title': 'issue',
                     'text': title + '\n' + description,
                     'metadataJson': json.dumps({
                         'labels': labels,
                         'author': author,
                         'created_at': created_at,
                         'updated_at': updated_at
                     })
                 }]
             }
-            logging.info(f"Indexing issue: {issue.id}")
-            self.indexer.index_document(issue_doc)
-            # Extract and index comments
-            comments = g.get_comments(issue.number)
+            # Extract comments
+            comments = g.get_issue_comments(issue.number)
             if len(comments)>0:
-                logging.info(f"Indexing {len(comments)} comments for issue {issue.number}")
+                logging.info(f"Adding {len(comments)} comments for repo {repo} issue {issue.number}")
+                self.add_comments(issue_doc, comments)
             else:
-                logging.info(f"No comments for issue {issue.number}")
-
-            for d_comment in comments:
-                comment = AttrDict(d_comment)
-                comment_id = comment.id
-                comment_text = comment.body
-                comment_author = comment.user.login
-                comment_created_at = str(comment.created_at)
-                metadata = {'comment_id': comment.id, 'url': comment.html_url, 'source': 'github'}
-
-                comment_doc = {
-                    'documentId': f'github-comment-{comment_id}',
-                    'title': title,
-                    'description': comment_text,
-                    'metadataJson': json.dumps(metadata),
-                    'section': [{
-                        'title': 'comment',
-                        'text': comment_text,
-                        'metadataJson': json.dumps({
-                            'author': comment_author,
-                            'created_at': comment_created_at,
-                            'updated_at': updated_at
-                        })
-                    }]
-                }
-                try:
-                    self.indexer.index_document(comment_doc)
-                except Exception as e:
-                    logging.info(f"Error {e} indexing comment document {comment_doc}")
-                    continue
+                logging.info(f"No comments for repo {repo}, issue {issue.number}")
+            # index everything
+            logging.info(f"Indexing issue: {issue.number}")
+            try:
+                self.indexer.index_document(issue_doc)
+            except Exception as e:
+                logging.info(f"Error {e} indexing repo {repo}, comment document {issue_doc}")
+                continue
+
+
+        # Extract and index codebase if requested
         if self.crawl_code:
             base_url = f"https://api.github.com/repos/{owner}/{repo}"
             self.crawl_code_folder(base_url)
-
 
     def crawl(self) -> None:
         for repo in self.repos:
             logging.info(f"Crawling repo {repo}")
diff --git a/crawlers/hackernews_crawler.py b/crawlers/hackernews_crawler.py
index c47ff0e..2a18ace 100644
--- a/crawlers/hackernews_crawler.py
+++ b/crawlers/hackernews_crawler.py
@@ -3,8 +3,9 @@
 import logging
 from core.crawler import Crawler
 import os
-from slugify import slugify
 from core.utils import html_to_text, create_session_with_retries
+from slugify import slugify
+
 from typing import List
 
 def get_comments(kids: List[str], entrypoint: str) -> List[str]:
diff --git a/crawlers/hubspot_crawler.py b/crawlers/hubspot_crawler.py
index b2ccf6c..2973ede 100644
--- a/crawlers/hubspot_crawler.py
+++ b/crawlers/hubspot_crawler.py
@@ -3,7 +3,6 @@
 from omegaconf import OmegaConf
 import requests
 from core.utils import clean_email_text
-from slugify import slugify
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
 import datetime
diff --git a/crawlers/s3_crawler.py b/crawlers/s3_crawler.py
index 00893b7..fcf2a82 100644
--- a/crawlers/s3_crawler.py
+++ b/crawlers/s3_crawler.py
@@ -1,11 +1,10 @@
-import logging
 import pathlib
-from slugify import slugify
 import boto3
 import os
 from typing import List, Tuple
 from core.crawler import Crawler
+from slugify import slugify
 
 def list_files_in_s3_bucket(bucket_name: str, prefix: str) -> List[str]:
     """