improvements to github crawler (#37)
added PRs to github crawler + refactor to simplify
ofermend authored Oct 11, 2023
1 parent 57ed013 · commit 4355ce4
Showing 8 changed files with 118 additions and 53 deletions.
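Most of the additions land in crawlers/github_crawler.py: the Github helper class gains get_pull_requests and get_pr_comments, and GithubCrawler.crawl_repo now builds one document per pull request (with its comments appended as extra sections) before indexing issues as before. Purely for orientation, the sketch below hits the same two GitHub REST endpoints directly with requests; it is not part of the commit, and the owner/repo names and GITHUB_TOKEN environment variable are placeholders.

import os
import requests

# Placeholders -- substitute a real repository and token of your own.
OWNER = "example-owner"
REPO = "example-repo"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
}

# Same endpoint used by Github.get_pull_requests("all") in the diff below.
prs = requests.get(
    f"https://api.github.com/repos/{OWNER}/{REPO}/pulls?state=all",
    headers=HEADERS,
).json()

for pr in prs:
    # Same endpoint used by Github.get_pr_comments(pr.number) in the diff below.
    comments = requests.get(
        f"https://api.github.com/repos/{OWNER}/{REPO}/pulls/{pr['number']}/comments",
        headers=HEADERS,
    ).json()
    print(pr["number"], pr["title"], f"{len(comments)} comments")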
2 changes: 1 addition & 1 deletion config/lethain.yaml
@@ -1,5 +1,5 @@
vectara:
corpus_id: 160
corpus_id: 240
customer_id: 1526022105
reindex: false

2 changes: 1 addition & 1 deletion core/crawler.py
@@ -1,5 +1,4 @@
from omegaconf import OmegaConf, DictConfig
from slugify import slugify
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -8,6 +7,7 @@
from core.indexer import Indexer
from core.pdf_convert import PDFConverter
from core.utils import binary_extensions, doc_extensions
from slugify import slugify

get_headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
6 changes: 3 additions & 3 deletions core/indexer.py
@@ -3,9 +3,8 @@
import os
from typing import Tuple, Dict, Any, List, Optional

from slugify import slugify
import time
from core.utils import create_session_with_retries
from slugify import slugify

from bs4 import BeautifulSoup

@@ -14,7 +13,8 @@
import nbformat
import markdown
import docutils.core
from core.utils import html_to_text, detect_language, get_file_size_in_MB

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries
from core.extract import get_content_and_title

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
1 change: 1 addition & 0 deletions core/utils.py
@@ -76,3 +76,4 @@ def get_file_size_in_MB(file_path: str) -> float:
file_size_bytes = os.path.getsize(file_path)
file_size_MB = file_size_bytes / (1024 * 1024)
return file_size_MB
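
Several of the smaller diffs in this commit only consolidate imports: slugify is moved below the core imports, and in core/indexer.py create_session_with_retries is folded into the single core.utils import. The helper itself is not shown in this diff; the sketch below is an assumed shape for a function with that name, built on requests and urllib3 retries, not the repository's actual code.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Assumed implementation only -- the real create_session_with_retries lives in
# core/utils.py and is not part of this diff.
def create_session_with_retries(retries: int = 3) -> requests.Session:
    retry = Retry(
        total=retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session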

153 changes: 109 additions & 44 deletions crawlers/github_crawler.py
Expand Up @@ -5,12 +5,23 @@
from attrdict import AttrDict
import logging
import base64
from datetime import datetime
import markdown

from ratelimiter import RateLimiter
from core.utils import create_session_with_retries
from core.utils import create_session_with_retries, html_to_text

from typing import List, Any

def convert_date(date_str: str) -> str:
# Remove the 'Z' at the end and parse the date string to a datetime object
date_obj = datetime.fromisoformat(date_str.replace("Z", ""))

# Format the datetime object to a string in the format YYYY-MM-DD
normal_date = date_obj.strftime("%Y-%m-%d")

return normal_date

class Github(object):
def __init__(self, repo: str, owner: str, token: str) -> None:
self.repo = repo
@@ -30,7 +41,7 @@ def get_issues(self, state: str) -> List[Any]:
logging.info(f"Error retrieving issues: {response.status_code}, {response.text}")
return []

def get_comments(self, issue_number: str) -> List[Any]:
def get_issue_comments(self, issue_number: str) -> List[Any]:
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/issues/{issue_number}/comments"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
@@ -39,7 +50,27 @@ def get_comments(self, issue_number: str) -> List[Any]:
else:
logging.info(f"Error retrieving comments: {response.status_code}, {response.text}")
return []

def get_pull_requests(self, state: str) -> List[Any]:
# state can be "open", "closed", "all", or "merged"
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls?state={state}"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
if response.status_code == 200:
return list(response.json())
else:
logging.info(f"Error retrieving pull requests: {response.status_code}, {response.text}")
return []

def get_pr_comments(self, pull_number: int) -> List[Any]:
api_url = f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls/{pull_number}/comments"
headers = {"Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json"}
response = self.session.get(api_url, headers=headers)
if response.status_code == 200:
return list(response.json())
else:
logging.info(f"Error retrieving comments for pull request #{pull_number}: {response.status_code}, {response.text}")
return []

class GithubCrawler(Crawler):

@@ -77,6 +108,7 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None:
logging.info(f"Failed to retrieve content for {fname} with url {url}: {e}")
continue

text_content = html_to_text(markdown.markdown(file_content))
metadata = {'file': fname, 'source': 'github', 'url': url}
code_doc = {
'documentId': f'github-{item["path"]}',
@@ -85,33 +117,87 @@ def crawl_code_folder(self, base_url: str, path: str = "") -> None:
'metadataJson': json.dumps(metadata),
'section': [{
'title': 'markdown',
'text': file_content,
'text': text_content,
}]
}

logging.info(f"Indexing codebase markdown: {item['path']}")
self.indexer.index_document(code_doc)
elif item["type"] == "dir":
self.crawl_code_folder(base_url, path=item["path"])

def add_comments(self, doc: dict, comments: List[Any]) -> None:
for d_comment in comments:
comment = AttrDict(d_comment)
metadata = {
'id': comment.id, 'url': comment.html_url, 'source': 'github',
'author': comment.user.login, 'created_at': convert_date(comment.created_at), 'updated_at': convert_date(comment.updated_at)
}
doc['section'].append({
'title': f'comment by {comment.user.login}',
'text': comment.body,
'metadataJson': json.dumps(metadata),
})

def crawl_repo(self, repo: str, owner: str, token: str) -> None:

# create github object
g = Github(repo, owner, token)
issues = g.get_issues("all")

# Extract and index pull requests
prs = g.get_pull_requests("all")
for d_pr in prs:
pr = AttrDict(d_pr)
doc_metadata = {
'source': 'github',
'id': pr.id,
'number': pr.number,
'url': pr.html_url,
'title': pr.title,
'state': pr.state,
'author': pr.user.login,
'created_at': convert_date(pr.created_at),
'updated_at': convert_date(pr.updated_at)
}
pr_doc = {
'documentId': f'github-{repo}-pr-{pr.number}',
'title': pr.title,
'metadataJson': json.dumps(doc_metadata),
'section': [{
'title': pr.title,
'text': pr.body,
}]
}

comments = g.get_pr_comments(pr.number)
if len(comments)>0:
logging.info(f"Adding {len(comments)} comments for repo {repo}, PR {pr.number}")
self.add_comments(pr_doc, comments)
else:
logging.info(f"No comments for repo {repo}, PR {pr.number}")

# index everything
try:
self.indexer.index_document(pr_doc)
except Exception as e:
logging.info(f"Error {e} indexing comment for repo {repo} document {pr_doc}")
continue

# Extract and index issues and comments
issues = g.get_issues("all")
for d_issue in issues:
# Extract issue metadata
issue = AttrDict(d_issue)
issue_id = f'github-issue-{issue.id}'
title = issue.title
description = issue.body
created_at = str(issue.created_at)
updated_at = str(issue.updated_at)
created_at = convert_date(issue.created_at)
updated_at = convert_date(issue.updated_at)
labels = [label.name for label in issue.labels]
author = issue.user.login
metadata = {'issue_number': issue.number, 'labels': labels, 'source': 'github', 'url': issue.html_url, 'state': issue.state}

issue_doc = {
'documentId': f'github-issue-{issue_id}',
'documentId': f'github-{repo}-issue-{issue.number}',
'title': title,
'description': description,
'metadataJson': json.dumps(metadata),
@@ -125,50 +211,29 @@ def crawl_repo(self, repo: str, owner: str, token: str) -> None:
})
}]
}
logging.info(f"Indexing issue: {issue.id}")
self.indexer.index_document(issue_doc)

# Extract and index comments
comments = g.get_comments(issue.number)
# Extract comments
comments = g.get_issue_comments(issue.number)
if len(comments)>0:
logging.info(f"Indexing {len(comments)} comments for issue {issue.number}")
logging.info(f"Adding {len(comments)} comments for repo {repo} issue {issue.number}")
self.add_comments(issue_doc, comments)
else:
logging.info(f"No comments for issue {issue.number}")

for d_comment in comments:
comment = AttrDict(d_comment)
comment_id = comment.id
comment_text = comment.body
comment_author = comment.user.login
comment_created_at = str(comment.created_at)
metadata = {'comment_id': comment.id, 'url': comment.html_url, 'source': 'github'}

comment_doc = {
'documentId': f'github-comment-{comment_id}',
'title': title,
'description': comment_text,
'metadataJson': json.dumps(metadata),
'section': [{
'title': 'comment',
'text': comment_text,
'metadataJson': json.dumps({
'author': comment_author,
'created_at': comment_created_at,
'updated_at': updated_at
})
}]
}
try:
self.indexer.index_document(comment_doc)
except Exception as e:
logging.info(f"Error {e} indexing comment document {comment_doc}")
continue
logging.info(f"No comments for repo {repo}, issue {issue.number}")

# index everything
logging.info(f"Indexing issue: {issue.number}")
try:
self.indexer.index_document(issue_doc)
except Exception as e:
logging.info(f"Error {e} indexing repo {repo}, comment document {issue_doc}")
continue


# Extract and index codebase if requested
if self.crawl_code:
base_url = f"https://api.github.com/repos/{owner}/{repo}"
self.crawl_code_folder(base_url)


def crawl(self) -> None:
for repo in self.repos:
logging.info(f"Crawling repo {repo}")
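One small helper worth calling out from the github_crawler.py diff above: convert_date normalizes GitHub's ISO-8601 timestamps (which carry a trailing Z) to plain YYYY-MM-DD strings for the document metadata. A standalone usage example, standard library only, reproducing the helper as written above:

from datetime import datetime

def convert_date(date_str: str) -> str:
    # Same logic as the helper added in crawlers/github_crawler.py.
    date_obj = datetime.fromisoformat(date_str.replace("Z", ""))
    return date_obj.strftime("%Y-%m-%d")

print(convert_date("2023-10-11T20:15:30Z"))  # -> 2023-10-11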
3 changes: 2 additions & 1 deletion crawlers/hackernews_crawler.py
@@ -3,8 +3,9 @@
import logging
from core.crawler import Crawler
import os
from slugify import slugify
from core.utils import html_to_text, create_session_with_retries
from slugify import slugify

from typing import List

def get_comments(kids: List[str], entrypoint: str) -> List[str]:
1 change: 0 additions & 1 deletion crawlers/hubspot_crawler.py
@@ -3,7 +3,6 @@
from omegaconf import OmegaConf
import requests
from core.utils import clean_email_text
from slugify import slugify
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import datetime
3 changes: 1 addition & 2 deletions crawlers/s3_crawler.py
@@ -1,11 +1,10 @@
import logging
import pathlib
from slugify import slugify
import boto3
import os
from typing import List, Tuple

from core.crawler import Crawler
from slugify import slugify

def list_files_in_s3_bucket(bucket_name: str, prefix: str) -> List[str]:
"""
