Skip to content

Commit

Permalink
Merge pull request #133 from lean-dojo/reduce-memory-usage
Browse files Browse the repository at this point in the history
Reduce memory usage
  • Loading branch information
Kaiyu Yang authored Feb 19, 2024
2 parents 271d356 + 6131b45 commit a11836a
Show file tree
Hide file tree
Showing 11 changed files with 103 additions and 97 deletions.
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
FROM kitware/cmake:ci-clang_cxx_modules-x86_64-2023-02-15

RUN yum -y install which gmp-devel python3 python3-pip
RUN ln -s $(which python3) /usr/bin/python
RUN pip3 install toml loguru tqdm

ENV ELAN_HOME="/.elan"
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
project = "LeanDojo"
copyright = "2023, LeanDojo Team"
author = "Kaiyu Yang"
release = "1.2.0"
release = "1.6.0rc"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ exclude = [

[project]
name = "lean-dojo"
version = "1.5.1"
version = "1.6.0rc"
authors = [
{ name="Kaiyu Yang", email="[email protected]" },
]
Expand Down
2 changes: 1 addition & 1 deletion src/lean_dojo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
ProofGivenUp,
)
from .interaction.parse_goals import Declaration, Goal, parse_goals
from .data_extraction.lean import LeanGitRepo, LeanFile, Theorem, Pos
from .data_extraction.lean import get_latest_commit, LeanGitRepo, LeanFile, Theorem, Pos
from .constants import __version__

if os.geteuid() == 0:
Expand Down
24 changes: 1 addition & 23 deletions src/lean_dojo/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,10 @@
from typing import Tuple
from loguru import logger
from dotenv import load_dotenv
from github import Github, Auth

load_dotenv()

__version__ = "1.5.1"
__version__ = "1.6.0rc"

logger.remove()
if "VERBOSE" in os.environ or "DEBUG" in os.environ:
Expand Down Expand Up @@ -59,27 +58,6 @@
LEAN4_URL = "https://github.com/leanprover/lean4"
"""The URL of the Lean 4 repo."""

GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None)
"""GiHub personal access token is optional.
If provided, it can increase the rate limit for GitHub API calls.
"""

if GITHUB_ACCESS_TOKEN:
logger.debug("Using GitHub personal access token for authentication")
GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN))
GITHUB.get_user().login
else:
logger.debug(
"Using GitHub without authentication. Don't be surprised if you hit the API rate limit."
)
GITHUB = Github()

LEAN4_REPO = GITHUB.get_repo("leanprover/lean4")
"""The GitHub Repo for Lean 4 itself."""

LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly")
"""The GitHub Repo for Lean 4 nightly releases."""

LEAN4_PACKAGES_DIR_OLD = Path("lake-packages")
"""The directory where Lean 4 dependencies are stored (before v4.3.0-rc2)."""

Expand Down
66 changes: 36 additions & 30 deletions src/lean_dojo/data_extraction/ExtractData.lean
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env :
| _ => pure ()


private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree) (parent : InfoTree) (env : Environment) : TraceM Unit := do
private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree)
(parent : InfoTree) (env : Environment) : TraceM Unit := do
match tree with
| .context ctx' t => traverseTree ctx' t tree env
| .node i children =>
Expand Down Expand Up @@ -397,17 +398,43 @@ end Traversal
open Traversal


def getImports (header: Syntax) : IO String := do
-- Similar to `lean --deps` in Lean 3.
let mut s := ""

for dep in headerToImports header do
let oleanPath ← findOLean dep.module
if oleanPath.isRelative then
let leanPath := Path.toSrcDir! oleanPath "lean"
assert! ← leanPath.pathExists
s := s ++ "\n" ++ leanPath.toString
else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then
let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString
let mut found := false
for c in (oleanPath.withExtension "lean").components do
if c == "lib" then
found := true
p := p ++ "src"
continue
if found then
p := p ++ FilePath.pathSeparator.toString ++ c
p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake"
assert! ← FilePath.mk p |>.pathExists
s := s ++ "\n" ++ p

return s.trim


/--
Trace a *.lean file.
-/
unsafe def processFile (path : FilePath) : IO Unit := do
println! path
let input ← IO.FS.readFile path
let opts := Options.empty.setBool `trace.Elab.info true
enableInitializersExecution
let inputCtx := Parser.mkInputContext input path.toString
let (header, parserState, messages) ← Parser.parseHeader inputCtx
let (env, messages) ← processHeader header opts messages inputCtx
let (env, messages) ← processHeader header {} messages inputCtx

if messages.hasErrors then
for msg in messages.toList do
Expand All @@ -416,7 +443,7 @@ unsafe def processFile (path : FilePath) : IO Unit := do
throw $ IO.userError "Errors during import; aborting"

let env := env.setMainModule (← moduleNameOfFileName path none)
let commandState := { Command.mkState env messages opts with infoState.enabled := true }
let commandState := { Command.mkState env messages {} with infoState.enabled := true }
let s ← IO.processCommands inputCtx parserState commandState
let env' := s.commandState.env
let commands := s.commands.pop -- Remove EOI command.
Expand All @@ -433,31 +460,9 @@ unsafe def processFile (path : FilePath) : IO Unit := do
Path.makeParentDirs json_path
IO.FS.writeFile json_path (toJson trace).pretty

-- Print imports, similar to `lean --deps` in Lean 3.
let mut s := ""
for dep in headerToImports header do
let oleanPath ← findOLean dep.module
if oleanPath.isRelative then
let leanPath := Path.toSrcDir! oleanPath "lean"
assert! ← leanPath.pathExists
s := s ++ "\n" ++ leanPath.toString
else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then
let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString
let mut found := false
for c in (oleanPath.withExtension "lean").components do
if c == "lib" then
found := true
p := p ++ "src"
continue
if found then
p := p ++ FilePath.pathSeparator.toString ++ c
p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake"
assert! ← FilePath.mk p |>.pathExists
s := s ++ "\n" ++ p

let dep_path := Path.toBuildDir "ir" relativePath "dep_paths" |>.get!
Path.makeParentDirs dep_path
IO.FS.writeFile dep_path s.trim
IO.FS.writeFile dep_path (← getImports header)


end LeanDojo
Expand Down Expand Up @@ -511,6 +516,7 @@ def processAllFiles (noDeps : Bool) : IO Unit := do

unsafe def main (args : List String) : IO Unit := do
match args with
| "nodeps" :: _ => processAllFiles true
| path :: _ => processFile (← Path.toAbsolute ⟨path⟩)
| [] => processAllFiles false
| ["noDeps"] => processAllFiles (noDeps := true)
| [path] => processFile (← Path.toAbsolute ⟨path⟩)
| [] => processAllFiles (noDeps := false)
| _ => throw $ IO.userError "Invalid arguments"
3 changes: 2 additions & 1 deletion src/lean_dojo/data_extraction/build_lean4_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ def main() -> None:
with launch_progressbar(dirs_to_monitor):
cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean"
if args.no_deps:
cmd += " nodeps"
cmd += " noDeps"
logger.debug(cmd)
run_cmd(cmd, capture_output=True)

check_files(packages_path, args.no_deps)
Expand Down
6 changes: 5 additions & 1 deletion src/lean_dojo/data_extraction/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
get_repo_info,
report_critical_failure,
)
from ..constants import CACHE_DIR, DISABLE_REMOTE_CACHE, REMOTE_CACHE_URL
from ..constants import (
CACHE_DIR,
DISABLE_REMOTE_CACHE,
REMOTE_CACHE_URL,
)


def _split_git_url(url: str) -> Tuple[str, str]:
Expand Down
59 changes: 54 additions & 5 deletions src/lean_dojo/data_extraction/lean.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
import os
import json
import toml
import time
import urllib
import webbrowser
from pathlib import Path
from loguru import logger
from functools import cache
from github import Github, Auth
from dataclasses import dataclass, field
from github.Repository import Repository
from typing import List, Dict, Any, Generator, Union, Optional, Tuple
Expand All @@ -19,17 +21,12 @@
execute,
read_url,
url_exists,
url_to_repo,
normalize_url,
get_repo_info,
working_directory,
get_latest_commit,
)
from ..constants import (
LEAN3_URL,
LEAN4_URL,
LEAN4_REPO,
LEAN4_NIGHTLY_REPO,
LEAN3_PACKAGES_DIR,
LEAN4_PACKAGES_DIR,
LEAN4_PACKAGES_DIR_OLD,
Expand All @@ -38,6 +35,58 @@
)


GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None)
"""GiHub personal access token is optional.
If provided, it can increase the rate limit for GitHub API calls.
"""

if GITHUB_ACCESS_TOKEN:
logger.debug("Using GitHub personal access token for authentication")
GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN))
GITHUB.get_user().login
else:
logger.debug(
"Using GitHub without authentication. Don't be surprised if you hit the API rate limit."
)
GITHUB = Github()

LEAN4_REPO = GITHUB.get_repo("leanprover/lean4")
"""The GitHub Repo for Lean 4 itself."""

LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly")
"""The GitHub Repo for Lean 4 nightly releases."""

_URL_REGEX = re.compile(r"(?P<url>.*?)/*")


def normalize_url(url: str) -> str:
return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`.


@cache
def url_to_repo(url: str, num_retries: int = 2) -> Repository:
url = normalize_url(url)
backoff = 1

while True:
try:
return GITHUB.get_repo("/".join(url.split("/")[-2:]))
except Exception as ex:
if num_retries <= 0:
raise ex
num_retries -= 1
logger.debug(f'url_to_repo("{url}") failed. Retrying...')
time.sleep(backoff)
backoff *= 2


@cache
def get_latest_commit(url: str) -> str:
"""Get the hash of the latest commit of the Git repo at ``url``."""
repo = url_to_repo(url)
return repo.get_branch(repo.default_branch).commit.sha


def cleanse_string(s: Union[str, Path]) -> str:
"""Replace : and / with _ in a string."""
return str(s).replace("/", "_").replace(":", "_")
Expand Down
34 changes: 1 addition & 33 deletions src/lean_dojo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
from loguru import logger
from functools import cache
from contextlib import contextmanager
from github.Repository import Repository
from ray.util.actor_pool import ActorPool
from typing import Tuple, Union, List, Generator, Optional

from .constants import GITHUB, NUM_WORKERS, TMP_DIR
from .constants import NUM_WORKERS, TMP_DIR


@contextmanager
Expand Down Expand Up @@ -240,37 +239,6 @@ def parse_str_list(s: str) -> List[str]:
return [_.strip()[1:-1] for _ in s[1:-1].split(",") if _ != ""]


_URL_REGEX = re.compile(r"(?P<url>.*?)/*")


def normalize_url(url: str) -> str:
return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`.


@cache
def url_to_repo(url: str, num_retries: int = 2) -> Repository:
url = normalize_url(url)
backoff = 1

while True:
try:
return GITHUB.get_repo("/".join(url.split("/")[-2:]))
except Exception as ex:
if num_retries <= 0:
raise ex
num_retries -= 1
logger.debug(f'url_to_repo("{url}") failed. Retrying...')
time.sleep(backoff)
backoff *= 2


@cache
def get_latest_commit(url: str) -> str:
"""Get the hash of the latest commit of the Git repo at ``url``."""
repo = url_to_repo(url)
return repo.get_branch(repo.default_branch).commit.sha


@cache
def is_git_repo(path: Path) -> bool:
"""Check if ``path`` is a Git repo."""
Expand Down
1 change: 0 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pytest

from lean_dojo import *
from lean_dojo.utils import get_latest_commit


LEAN3_URL = "https://github.com/leanprover-community/lean"
Expand Down

0 comments on commit a11836a

Please sign in to comment.