improvements to github crawler (#37)
added PRs to github crawler + refactor to simplify
ofermend authored Oct 11, 2023
1 parent 57ed013 · commit 4355ce4
Showing 8 changed files with 118 additions and 53 deletions.
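Most of the additions land in crawlers/github_crawler.py: the Github helper class gains get_pull_requests and get_pr_comments, and GithubCrawler.crawl_repo now builds one document per pull request (with its comments appended as extra sections) before indexing issues as before. Purely for orientation, the sketch below hits the same two GitHub REST endpoints directly with requests; it is not part of the commit, and the owner/repo names and GITHUB_TOKEN environment variable are placeholders.

import os
import requests

# Placeholders -- substitute a real repository and token of your own.
OWNER = "example-owner"
REPO = "example-repo"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
}

# Same endpoint used by Github.get_pull_requests("all") in the diff below.
prs = requests.get(
    f"https://api.github.com/repos/{OWNER}/{REPO}/pulls?state=all",
    headers=HEADERS,
).json()

for pr in prs:
    # Same endpoint used by Github.get_pr_comments(pr.number) in the diff below.
    comments = requests.get(
        f"https://api.github.com/repos/{OWNER}/{REPO}/pulls/{pr['number']}/comments",
        headers=HEADERS,
    ).json()
    print(pr["number"], pr["title"], f"{len(comments)} comments")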
2 changes: 1 addition & 1 deletion config/lethain.yaml
@@ -1,5 +1,5 @@
vectara:
corpus_id: 160
corpus_id: 240
customer_id: 1526022105
reindex: false

2 changes: 1 addition & 1 deletion core/crawler.py
@@ -1,5 +1,4 @@
from omegaconf import OmegaConf, DictConfig
from slugify import slugify
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -8,6 +7,7 @@
from core.indexer import Indexer
from core.pdf_convert import PDFConverter
from core.utils import binary_extensions, doc_extensions
from slugify import slugify

get_headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
6 changes: 3 additions & 3 deletions core/indexer.py
@@ -3,9 +3,8 @@
import os
from typing import Tuple, Dict, Any, List, Optional

from slugify import slugify
import time
from core.utils import create_session_with_retries
from slugify import slugify

from bs4 import BeautifulSoup

@@ -14,7 +13,8 @@
import nbformat
import markdown
import docutils.core
from core.utils import html_to_text, detect_language, get_file_size_in_MB

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries
from core.extract import get_content_and_title

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
1 change: 1 addition & 0 deletions core/utils.py
@@ -76,3 +76,4 @@ def get_file_size_in_MB(file_path: str) -> float:
file_size_bytes = os.path.getsize(file_path)
file_size_MB = file_size_bytes / (1024 * 1024)
return file_size_MB
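
Several of the smaller diffs in this commit only consolidate imports: slugify is moved below the core imports, and in core/indexer.py create_session_with_retries is folded into the single core.utils import. The helper itself is not shown in this diff; the sketch below is an assumed shape for a function with that name, built on requests and urllib3 retries, not the repository's actual code.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Assumed implementation only -- the real create_session_with_retries lives in
# core/utils.py and is not part of this diff.
def create_session_with_retries(retries: int = 3) -> requests.Session:
    retry = Retry(
        total=retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session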

153 changes: 109 additions & 44 deletions crawlers/github_crawler.py
Expand Up @@ -5,12 +5,23 @@
from attrdict import AttrDict
import logging
import base64
from datetime import datetime
import markdown

from ratelimiter import RateLimiter
from core.utils import create_session_with_retries
from core.utils import create_session_with_retries, html_to_text

from typing import List, Any

def convert_date(date_str: str) -> str:
# Remove the 'Z' at the end and parse the date string to a datetime object
date_obj = datetime.fromisoformat(date_str.replace("Z", ""))

# Format the datetime object to a string in the format YYYY-MM-DD
normal_date = date_obj.strftime("%Y-%m-%d")

return normal_date

class Github(object):
def __init__(self, repo: str, owner: str, token: str) -> None:
self.repo = repo
@@ -30,7 +41,7 @@ def get_issues(self, state: str) -> List[Any]:
logging.info(f"Error retrieving issues: {response.status_code}, {response.text}")
return []

def get_comments(self, issue_number: str) -> List[Any]:
def get_issue_comments(self, issue_number: str) -> List[Any]:
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/issues/{issue_number}/comments"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
@@ -39,7 +50,27 @@ def get_comments(self, issue_number: str) -> List[Any]:
else:
logging.info(f"Error retrieving comments: {response.status_code}, {response.text}")
return []

def get_pull_requests(self, state: str) -> List[Any]:
# state can be "open", "closed", "all", or "merged"
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls?state={state}"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
if response.status_code == 200:
return list(response.json())
else:
logging.info(f"Error retrieving pull requests: {response.status_code}, {response.text}")
return []

def get_pr_comments(self, pull_number: int) -> List[Any]:
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls/{pull_number}/comments"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
if response.status_code == 200:
return list(response.json())
else:
logging.info(f"Error retrieving comments for pull request #{pull_number}: {response.status_code}, {response.text}")
return []

class GithubCrawler(Crawler):

@@ -77,6 +108,7 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None:
logging.info(f"Failed to retrieve content for {fname} with url {url}: {e}")
continue

text_content = html_to_text(markdown.markdown(file_content))
metadata = {'file': fname, 'source': 'github', 'url': url}
code_doc = {
'documentId': f'github-{item["path"]}',
@@ -85,33 +117,87 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None:
'metadataJson': json.dumps(metadata),
'section': [{
'title': 'markdown',
'text': file_content,
'text': text_content,
}]
}

logging.info(f"Indexing codebase markdown: {item['path']}")
self.indexer.index_document(code_doc)
elif item["type"] == "dir":
self.crawl_code_folder(base_url, path=item["path"])

def add_comments(self, doc: dict, comments: List[Any]) -> None:
for d_comment in comments:
comment = AttrDict(d_comment)
metadata = {
'id': comment.id, 'url': comment.html_url, 'source': 'github',
'author': comment.user.login, 'created_at': convert_date(comment.created_at), 'updated_at': convert_date(comment.updated_at)
}
doc['section'].append({
'title': f'comment by {comment.user.login}',
'text': comment.body,
'metadataJson': json.dumps(metadata),
})

def crawl_repo(self, repo: str, owner: str, token: str) -> None:

# create github object
g = Github(repo, owner, token)
issues = g.get_issues("all")

# Extract and index pull requests
prs = g.get_pull_requests("all")
for d_pr in prs:
pr = AttrDict(d_pr)
doc_metadata = {
'source': 'github',
'id': pr.id,
'number': pr.number,
'url': pr.html_url,
'title': pr.title,
'state': pr.state,
'author': pr.user.login,
'created_at': convert_date(pr.created_at),
'updated_at': convert_date(pr.updated_at)
}
pr_doc = {
'documentId': f'github-{repo}-pr-{pr.number}',
'title': pr.title,
'metadataJson': json.dumps(doc_metadata),
'section': [{
'title': pr.title,
'text': pr.body,
}]
}

comments = g.get_pr_comments(pr.number)
if len(comments)>0:
logging.info(f"Adding {len(comments)} comments for repo {repo}, PR {pr.number}")
self.add_comments(pr_doc, comments)
else:
logging.info(f"No comments for repo {repo}, PR {pr.number}")

# index everything
try:
self.indexer.index_document(pr_doc)
except Exception as e:
logging.info(f"Error {e} indexing comment for repo {repo} document {pr_doc}")
continue

# Extract and index issues and comments
issues = g.get_issues("all")
for d_issue in issues:
# Extract issue metadata
issue = AttrDict(d_issue)
issue_id = f'github-issue-{issue.id}'
title = issue.title
description = issue.body
created_at = str(issue.created_at)
updated_at = str(issue.updated_at)
created_at = convert_date(issue.created_at)
updated_at = convert_date(issue.updated_at)
labels = [label.name for label in issue.labels]
author = issue.user.login
metadata = {'issue_number': issue.number, 'labels': labels, 'source': 'github', 'url': issue.html_url, 'state': issue.state}

issue_doc = {
'documentId': f'github-issue-{issue_id}',
'documentId': f'github-{repo}-issue-{issue.number}',
'title': title,
'description': description,
'metadataJson': json.dumps(metadata),
@@ -125,50 +211,29 @@ def crawl_repo(self, repo: str, owner: str, token: str) -> None:
})
}]
}
logging.info(f"Indexing issue: {issue.id}")
self.indexer.index_document(issue_doc)

# Extract and index comments
comments = g.get_comments(issue.number)
# Extract comments
comments = g.get_issue_comments(issue.number)
if len(comments)>0:
logging.info(f"Indexing {len(comments)} comments for issue {issue.number}")
logging.info(f"Adding {len(comments)} comments for repo {repo} issue {issue.number}")
self.add_comments(issue_doc, comments)
else:
logging.info(f"No comments for issue {issue.number}")

for d_comment in comments:
comment = AttrDict(d_comment)
comment_id = comment.id
comment_text = comment.body
comment_author = comment.user.login
comment_created_at = str(comment.created_at)
metadata = {'comment_id': comment.id, 'url': comment.html_url, 'source': 'github'}

comment_doc = {
'documentId': f'github-comment-{comment_id}',
'title': title,
'description': comment_text,
'metadataJson': json.dumps(metadata),
'section': [{
'title': 'comment',
'text': comment_text,
'metadataJson': json.dumps({
'author': comment_author,
'created_at': comment_created_at,
'updated_at': updated_at
})
}]
}
try:
self.indexer.index_document(comment_doc)
except Exception as e:
logging.info(f"Error {e} indexing comment document {comment_doc}")
continue
logging.info(f"No comments for repo {repo}, issue {issue.number}")

# index everything
logging.info(f"Indexing issue: {issue.number}")
try:
self.indexer.index_document(issue_doc)
except Exception as e:
logging.info(f"Error {e} indexing repo {repo}, comment document {issue_doc}")
continue


# Extract and index codebase if requested
if self.crawl_code:
base_url = f"https://api.github.com/repos/{owner}/{repo}"
self.crawl_code_folder(base_url)


def crawl(self) -> None:
for repo in self.repos:
logging.info(f"Crawling repo {repo}")
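One small helper worth calling out from the github_crawler.py diff above: convert_date normalizes GitHub's ISO-8601 timestamps (which carry a trailing Z) to plain YYYY-MM-DD strings for the document metadata. A standalone usage example, standard library only, reproducing the helper as written above:

from datetime import datetime

def convert_date(date_str: str) -> str:
    # Same logic as the helper added in crawlers/github_crawler.py.
    date_obj = datetime.fromisoformat(date_str.replace("Z", ""))
    return date_obj.strftime("%Y-%m-%d")

print(convert_date("2023-10-11T20:15:30Z"))  # -> 2023-10-11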
3 changes: 2 additions & 1 deletion crawlers/hackernews_crawler.py
@@ -3,8 +3,9 @@
import logging
from core.crawler import Crawler
import os
from slugify import slugify
from core.utils import html_to_text, create_session_with_retries
from slugify import slugify

from typing import List

def get_comments(kids: List[str], entrypoint: str) -> List[str]:
1 change: 0 additions & 1 deletion crawlers/hubspot_crawler.py
@@ -3,7 +3,6 @@
from omegaconf import OmegaConf
import requests
from core.utils import clean_email_text
from slugify import slugify
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import datetime
3 changes: 1 addition & 2 deletions crawlers/s3_crawler.py
@@ -1,11 +1,10 @@
import logging
import pathlib
from slugify import slugify
import boto3
import os
from typing import List, Tuple

from core.crawler import Crawler
from slugify import slugify

def list_files_in_s3_bucket(bucket_name: str, prefix: str) -> List[str]:
"""
