feat: revamp index discovery
* now searches GH for all pkgs with a root `lakefile.lean`
* can output a hierarchical directory of metadata (see the sketch below)
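
For orientation, a minimal sketch of consuming that hierarchical output (not part of the commit; load_index is a hypothetical helper, and the <index-dir>/<owner>/<name>/metadata.json layout is taken from query.py below):

    import json, pathlib

    def load_index(index_dir):
        # query.py -D writes one metadata.json per package under <owner>/<name>
        for path in sorted(pathlib.Path(index_dir).glob('*/*/metadata.json')):
            yield json.loads(path.read_text())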
tydeu committed Dec 1, 2023
1 parent d9a366e commit e510ade
Showing 3 changed files with 103 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/index.yaml
@@ -22,7 +22,7 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Query Repositories
-       run: ./query.py -o index.json -v -L ${{github.ref_name == 'master' && 1000 || 75}}
+       run: ./query.py -o index.json -v -L ${{github.ref_name != 'master' && 100 || 0}}
        env:
          GH_TOKEN: ${{ github.token }}
      - name: Upload Index
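Note: with this change, non-master refs run a bounded query (-L 100) while master passes -L 0, which the revised query.py below treats as "no limit" by taking the paginated `gh api search/code` path.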
141 changes: 100 additions & 41 deletions query.py
@@ -1,55 +1,101 @@
#!/usr/bin/env python3
from datetime import datetime
import argparse
+import itertools
+import os
import subprocess
-import requests
import json
import logging

-def query(limit: int):
-    query = "language:Lean stars:>1 sort:stars"
-    fields = ["fullName","description","license","createdAt","updatedAt","pushedAt","url","homepage","stargazersCount"]
-    child = subprocess.run([
-        "gh", "search", "repos",
-        *query.split(' '), "-L", str(limit), "--json", ','.join(fields),
-    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+# from https://antonz.org/page-iterator/
+def paginate(iterable, page_size):
+    it = iter(iterable)
+    slicer = lambda: list(itertools.islice(it, page_size))
+    return iter(slicer, [])
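# (illustration, not part of the commit) paginate lazily chunks any iterable
# into fixed-size pages, stopping at the first empty slice:
#   list(paginate(range(5), 2)) == [[0, 1], [2, 3], [4]]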

+def run_cmd(*args: str) -> bytes:
+    child = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if child.returncode != 0:
        raise RuntimeError(child.stderr.decode().strip())
-    return json.loads(child.stdout)
+    elif len(child.stderr) > 0:
+        logging.error(child.stderr.decode())
+    return child.stdout
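# (illustration, not part of the commit) run_cmd returns raw stdout bytes,
# raising on a nonzero exit and logging stray stderr output, e.g.:
#   version = run_cmd('gh', '--version').decode()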

+REPO_QUERY="""
+query($repoIds: [ID!]!) {
+  nodes(ids: $repoIds) {
+    ... on Repository {
+      nameWithOwner
+      description
+      licenseInfo {
+        spdxId
+      }
+      createdAt
+      updatedAt
+      pushedAt
+      url
+      homepageUrl
+      stargazerCount
+    }
+  }
+}
+"""

+def query_repo_data(repoIds: 'list[str]') -> dict:
+    results = list()
+    for page in paginate(repoIds, 100):
+        fields = list()
+        for id in page:
+            fields.append('-f')
+            fields.append(f'repoIds[]={id}')
+        out = run_cmd(
+            'gh', 'api', 'graphql',
+            '-f', f'query={REPO_QUERY}', *fields
+        )
+        results = results + json.loads(out)['data']['nodes']
+    return results
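# (illustration, not part of the commit) each node mirrors the query's fields,
# e.g. (values abridged):
#   {'nameWithOwner': 'leanprover/std4', 'licenseInfo': {'spdxId': 'Apache-2.0'},
#    'stargazerCount': 42, ...}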

-def filter_lake(repo):
-    repo = repo['fullName']
-    url = f"https://raw.githubusercontent.com/{repo}/HEAD/lakefile.lean"
-    logging.debug(f"querying {url}")
-    code = requests.head(url, allow_redirects=True).status_code
-    log = f"status {code} for repository {repo}"
-    if code == 404:
-        logging.debug(log)
-        return False
-    elif code == 200:
-        logging.debug(log)
-        return True
+# NOTE: GitHub limits code searches to 10 requests/min, which is 1000 results.
+# Thus, the strategy used here will need to change when we hit that limit.
+def query_lake_repos(limit: int) -> 'list[str]':
+    query='filename:lakefile.lean path:/'
+    if limit == 0:
+        out = run_cmd(
+            'gh', 'api', 'search/code',
+            '--paginate', '--cache', '1h',
+            '-X', 'GET', '-f', f'q={query}',
+            '-q', '.items[] | .repository.node_id'
+        )
    else:
-        logging.error(f"status {code} for repository {repo}")
-        return False
+        out = run_cmd(
+            'gh', 'search', 'code', *query.split(' '), '-L', str(limit),
+            '--json', 'path,repository', '-q', '.[] | .repository.id'
+        )
+    return out.decode().splitlines()
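# (illustration, not part of the commit) both branches emit one repository
# node ID per line, so the result looks like (IDs hypothetical):
#   ['R_kgDOJroFvQ', 'R_kgDOHeVBWg', ...]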

def enrich(repo: dict):
-    repo['id'] = repo['fullName'].replace("-", "--").replace("/", "-")
+    repo['fullName'] = repo.pop('nameWithOwner')
+    repo['id'] = repo['fullName'].replace('-', '--').replace('/', '-')
    repo['owner'], repo['name'] = repo['fullName'].split('/')
-    repo['license'] = repo['license']['key']
-    repo['stars'] = repo.pop('stargazersCount')
-    repo['updatedAt'] = max(repo['updatedAt'], repo['pushedAt'])
-    del repo['pushedAt']
+    info = repo.pop('licenseInfo')
+    spdxId = None if info is None else info['spdxId']
+    spdxId = None if spdxId in ['NONE', 'NOASSERTION'] else spdxId
+    repo['license'] = spdxId
+    repo['homepage'] = repo.pop('homepageUrl')
+    repo['stars'] = repo.pop('stargazerCount')
+    repo['updatedAt'] = max(repo['updatedAt'], repo.pop('pushedAt'))
    return repo
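# (illustration, not part of the commit) the id mangling keeps '/' out of the
# slug while remaining reversible:
#   'leanprover/std4' -> 'leanprover-std4'
#   'foo-bar/baz'     -> 'foo--bar-baz'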

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('-L', '--limit', type=int, default=50,
-        help='(max) number of results to query from GitHub')
+    parser.add_argument('-L', '--limit', type=int, default=100,
+        help='(max) number of results to query from GitHub (0 for no limit)')
    parser.add_argument('-X', '--exclusions', default="query-exclusions.txt",
        help='file containing repos to exclude')
-    parser.add_argument('-o', '--output',
-        help='file to output results')
+    parser.add_argument('-o', '--output-manifest',
+        help='file to output result manifest')
+    parser.add_argument('-D', '--index-dir', default=None,
+        help='directory to output hierarchical index')
    parser.add_argument('-q', '--quiet', dest="verbosity", action='store_const', const=0, default=1,
        help='print no logging information')
    parser.add_argument('-v', '--verbose', dest="verbosity", action='store_const', const=2,
@@ -68,20 +114,33 @@ def enrich(repo: dict):
    with open(args.exclusions, 'r') as f:
        for line in f: exclusions.add(line.strip())

-    repos = query(args.limit)
-    logging.info(f"found {len(repos)} (max {args.limit}) notable Lean repositories")
-    repos = filter(lambda repo: repo['fullName'] not in exclusions, repos)
-    repos = filter(filter_lake, repos)
+    def curate(pkg: dict):
+        return pkg['fullName'] not in exclusions and pkg['stars'] > 1
+
+    repos = query_lake_repos(args.limit)
+    logging.info(f"found {len(repos)} repositories with root lakefiles")
+
+    repos = query_repo_data(repos)
    pkgs = map(enrich, repos)
+    pkgs = filter(curate, pkgs)
    pkgs = sorted(pkgs, key=lambda pkg: pkg['stars'], reverse=True)
    pkgs = list(pkgs)
-    logging.info(f"found {len(pkgs)} notable Lean repositories with lakefiles")
+    logging.info(f"found {len(pkgs)} notable repositories")

+    if args.index_dir is not None:
+        for pkg in pkgs:
+            pkg_dir = os.path.join(args.index_dir, pkg['owner'], pkg['name'])
+            if not os.path.exists(pkg_dir):
+                os.makedirs(pkg_dir)
+            with open(os.path.join(pkg_dir, "metadata.json"), 'w') as f:
+                f.write(json.dumps(pkg, indent=2))
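# (illustration, not part of the commit) e.g. `./query.py -D out` writes one
# file per curated package:
#   out/leanprover/std4/metadata.json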

    data = {
        'fetchedAt': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'packages': pkgs
    }
-    if args.output is None:
-        print(json.dumps(data))
+    if args.output_manifest is None:
+        print(json.dumps(data, indent=2))
    else:
-        with open(args.output, 'w') as f:
-            f.write(json.dumps(data))
+        with open(args.output_manifest, 'w') as f:
+            f.write(json.dumps(data, indent=2))
4 changes: 2 additions & 2 deletions site/pages/packages/[id].vue
@@ -24,8 +24,8 @@ defineOgImage({
  hasNoDescription: !pkg.description,
})
-const formatLicense = (id: string) => {
-  switch (id) {
+const formatLicense = (id: string | null) => {
+  switch (id && id.toLowerCase()) {
    case 'apache-2.0':
      return "Apache 2.0"
    case "bsd-2-clause":
