feat: revamp index discovery
* now searches GH for all pkgs with a root `lakefile.lean`
* can output a hierarchical directory of metadata (see the sketch below)
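
For orientation, a minimal sketch of consuming that hierarchical output (not part of the commit; load_index is a hypothetical helper, and the <index-dir>/<owner>/<name>/metadata.json layout is taken from query.py below):

    import json, pathlib

    def load_index(index_dir):
        # query.py -D writes one metadata.json per package under <owner>/<name>
        for path in sorted(pathlib.Path(index_dir).glob('*/*/metadata.json')):
            yield json.loads(path.read_text())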
tydeu committed Dec 1, 2023
1 parent d9a366e commit e510ade
Showing 3 changed files with 103 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/index.yaml
@@ -22,7 +22,7 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Query Repositories
-       run: ./query.py -o index.json -v -L ${{github.ref_name == 'master' && 1000 || 75}}
+       run: ./query.py -o index.json -v -L ${{github.ref_name != 'master' && 100 || 0}}
        env:
          GH_TOKEN: ${{ github.token }}
      - name: Upload Index
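Note: with this change, non-master refs run a bounded query (-L 100) while master passes -L 0, which the revised query.py below treats as "no limit" by taking the paginated `gh api search/code` path.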
141 changes: 100 additions & 41 deletions query.py
@@ -1,55 +1,101 @@
#!/usr/bin/env python3
from datetime import datetime
import argparse
+import itertools
+import os
import subprocess
-import requests
import json
import logging

-def query(limit: int):
-    query = "language:Lean stars:>1 sort:stars"
-    fields = ["fullName","description","license","createdAt","updatedAt","pushedAt","url","homepage","stargazersCount"]
-    child = subprocess.run([
-        "gh", "search", "repos",
-        *query.split(' '), "-L", str(limit), "--json", ','.join(fields),
-    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+# from https://antonz.org/page-iterator/
+def paginate(iterable, page_size):
+    it = iter(iterable)
+    slicer = lambda: list(itertools.islice(it, page_size))
+    return iter(slicer, [])
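# (illustration, not part of the commit) paginate lazily chunks any iterable
# into fixed-size pages, stopping at the first empty slice:
#   list(paginate(range(5), 2)) == [[0, 1], [2, 3], [4]]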

+def run_cmd(*args: str) -> bytes:
+    child = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if child.returncode != 0:
        raise RuntimeError(child.stderr.decode().strip())
-    return json.loads(child.stdout)
+    elif len(child.stderr) > 0:
+        logging.error(child.stderr.decode())
+    return child.stdout
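# (illustration, not part of the commit) run_cmd returns raw stdout bytes,
# raising on a nonzero exit and logging stray stderr output, e.g.:
#   version = run_cmd('gh', '--version').decode()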

+REPO_QUERY="""
+query($repoIds: [ID!]!) {
+  nodes(ids: $repoIds) {
+    ... on Repository {
+      nameWithOwner
+      description
+      licenseInfo {
+        spdxId
+      }
+      createdAt
+      updatedAt
+      pushedAt
+      url
+      homepageUrl
+      stargazerCount
+    }
+  }
+}
+"""

+def query_repo_data(repoIds: 'list[str]') -> dict:
+    results = list()
+    for page in paginate(repoIds, 100):
+        fields = list()
+        for id in page:
+            fields.append('-f')
+            fields.append(f'repoIds[]={id}')
+        out = run_cmd(
+            'gh', 'api', 'graphql',
+            '-f', f'query={REPO_QUERY}', *fields
+        )
+        results = results + json.loads(out)['data']['nodes']
+    return results
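# (illustration, not part of the commit) each node mirrors the query's fields,
# e.g. (values abridged):
#   {'nameWithOwner': 'leanprover/std4', 'licenseInfo': {'spdxId': 'Apache-2.0'},
#    'stargazerCount': 42, ...}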

-def filter_lake(repo):
-    repo = repo['fullName']
-    url = f"https://raw.githubusercontent.com/{repo}/HEAD/lakefile.lean"
-    logging.debug(f"querying {url}")
-    code = requests.head(url, allow_redirects=True).status_code
-    log = f"status {code} for repository {repo}"
-    if code == 404:
-        logging.debug(log)
-        return False
-    elif code == 200:
-        logging.debug(log)
-        return True
+# NOTE: GitHub limits code searches to 10 requests/min, which is 1000 results.
+# Thus, the strategy used here will need to change when we hit that limit.
+def query_lake_repos(limit: int) -> 'list[str]':
+    query='filename:lakefile.lean path:/'
+    if limit == 0:
+        out = run_cmd(
+            'gh', 'api', 'search/code',
+            '--paginate', '--cache', '1h',
+            '-X', 'GET', '-f', f'q={query}',
+            '-q', '.items[] | .repository.node_id'
+        )
    else:
-        logging.error(f"status {code} for repository {repo}")
-        return False
+        out = run_cmd(
+            'gh', 'search', 'code', *query.split(' '), '-L', str(limit),
+            '--json', 'path,repository', '-q', '.[] | .repository.id'
+        )
+    return out.decode().splitlines()
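# (illustration, not part of the commit) both branches emit one repository
# node ID per line, so the result looks like (IDs hypothetical):
#   ['R_kgDOJroFvQ', 'R_kgDOHeVBWg', ...]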

def enrich(repo: dict):
-    repo['id'] = repo['fullName'].replace("-", "--").replace("/", "-")
+    repo['fullName'] = repo.pop('nameWithOwner')
+    repo['id'] = repo['fullName'].replace('-', '--').replace('/', '-')
    repo['owner'], repo['name'] = repo['fullName'].split('/')
-    repo['license'] = repo['license']['key']
-    repo['stars'] = repo.pop('stargazersCount')
-    repo['updatedAt'] = max(repo['updatedAt'], repo['pushedAt'])
-    del repo['pushedAt']
+    info = repo.pop('licenseInfo')
+    spdxId = None if info is None else info['spdxId']
+    spdxId = None if spdxId in ['NONE', 'NOASSERTION'] else spdxId
+    repo['license'] = spdxId
+    repo['homepage'] = repo.pop('homepageUrl')
+    repo['stars'] = repo.pop('stargazerCount')
+    repo['updatedAt'] = max(repo['updatedAt'], repo.pop('pushedAt'))
    return repo
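# (illustration, not part of the commit) the id mangling keeps '/' out of the
# slug while remaining reversible:
#   'leanprover/std4' -> 'leanprover-std4'
#   'foo-bar/baz'     -> 'foo--bar-baz'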

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('-L', '--limit', type=int, default=50,
-        help='(max) number of results to query from GitHub')
+    parser.add_argument('-L', '--limit', type=int, default=100,
+        help='(max) number of results to query from GitHub (0 for no limit)')
    parser.add_argument('-X', '--exclusions', default="query-exclusions.txt",
        help='file containing repos to exclude')
-    parser.add_argument('-o', '--output',
-        help='file to output results')
+    parser.add_argument('-o', '--output-manifest',
+        help='file to output result manifest')
+    parser.add_argument('-D', '--index-dir', default=None,
+        help='directory to output hierarchical index')
    parser.add_argument('-q', '--quiet', dest="verbosity", action='store_const', const=0, default=1,
        help='print no logging information')
    parser.add_argument('-v', '--verbose', dest="verbosity", action='store_const', const=2,
@@ -68,20 +114,33 @@ def enrich(repo: dict):
    with open(args.exclusions, 'r') as f:
        for line in f: exclusions.add(line.strip())

-    repos = query(args.limit)
-    logging.info(f"found {len(repos)} (max {args.limit}) notable Lean repositories")
-    repos = filter(lambda repo: repo['fullName'] not in exclusions, repos)
-    repos = filter(filter_lake, repos)
+    def curate(pkg: dict):
+        return pkg['fullName'] not in exclusions and pkg['stars'] > 1
+
+    repos = query_lake_repos(args.limit)
+    logging.info(f"found {len(repos)} repositories with root lakefiles")
+
+    repos = query_repo_data(repos)
    pkgs = map(enrich, repos)
+    pkgs = filter(curate, pkgs)
    pkgs = sorted(pkgs, key=lambda pkg: pkg['stars'], reverse=True)
    pkgs = list(pkgs)
-    logging.info(f"found {len(pkgs)} notable Lean repositories with lakefiles")
+    logging.info(f"found {len(pkgs)} notable repositories")

+    if args.index_dir is not None:
+        for pkg in pkgs:
+            pkg_dir = os.path.join(args.index_dir, pkg['owner'], pkg['name'])
+            if not os.path.exists(pkg_dir):
+                os.makedirs(pkg_dir)
+            with open(os.path.join(pkg_dir, "metadata.json"), 'w') as f:
+                f.write(json.dumps(pkg, indent=2))
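# (illustration, not part of the commit) e.g. `./query.py -D out` writes one
# file per curated package:
#   out/leanprover/std4/metadata.json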

    data = {
        'fetchedAt': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'packages': pkgs
    }
-    if args.output is None:
-        print(json.dumps(data))
+    if args.output_manifest is None:
+        print(json.dumps(data, indent=2))
    else:
-        with open(args.output, 'w') as f:
-            f.write(json.dumps(data))
+        with open(args.output_manifest, 'w') as f:
+            f.write(json.dumps(data, indent=2))
4 changes: 2 additions & 2 deletions site/pages/packages/[id].vue
@@ -24,8 +24,8 @@ defineOgImage({
  hasNoDescription: !pkg.description,
})
-const formatLicense = (id: string) => {
-  switch (id) {
+const formatLicense = (id: string | null) => {
+  switch (id && id.toLowerCase()) {
    case 'apache-2.0':
      return "Apache 2.0"
    case "bsd-2-clause":
