From ffd55a93394621022c0176522b888e016e9fb3fa Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Thu, 8 Feb 2024 16:23:31 -0600 Subject: [PATCH 1/4] wip --- docs/source/conf.py | 2 +- docs/source/troubleshooting.rst | 4 +- docs/source/user-guide.rst | 1 + pyproject.toml | 2 +- src/lean_dojo/__init__.py | 2 +- src/lean_dojo/constants.py | 32 ++---- .../data_extraction/ExtractData.lean | 103 +++++++++++------- .../data_extraction/build_lean4_repo.py | 7 +- src/lean_dojo/data_extraction/cache.py | 7 +- src/lean_dojo/data_extraction/lean.py | 59 +++++++++- src/lean_dojo/data_extraction/trace.py | 3 + src/lean_dojo/utils.py | 34 +----- tests/conftest.py | 1 - 13 files changed, 148 insertions(+), 109 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 36f79d5e..d979d065 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ project = "LeanDojo" copyright = "2023, LeanDojo Team" author = "Kaiyu Yang" -release = "1.2.0" +release = "1.6.0rc" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 33a105be..05981894 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -20,7 +20,9 @@ Tracing Repos The most likely reason is your machine doesn't have enough memory. The amount of memory required depends on the repo you're tracing. For large repos, such as mathlib, you need at least 32 GB memory. If getting more memory is not an option, -you can try a smaller repo. If your machine has enough memory but the process still gets killed, please check +you can try a smaller repo. Alternatively, you can run LeanDojo in the low-memory mode by setting the environment variable :code:`LOW_MEMORY_MODE=1`. In this mode, +LeanDojo will use less memory but cannot trace premise information. +If your machine has enough memory but the process still gets killed, please check whether your Docker has access to all resources of host machine (see `here `_). Interacting with Lean diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index ef40ed0f..22871b78 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -237,6 +237,7 @@ LeanDojo's behavior can be configured through the following environment variable * :code:`TACTIC_MEMORY_LIMIT`: Maximum memory when interacting with Lean (see the `--memory` flag of `docker run `_) (only applicable when :code:`CONTAINER=docker`). Default to 16 GB. * :code:`GITHUB_ACCESS_TOKEN`: GitHub `personal access token `_ for using the GitHub API. They are optional. If provided, they can increase the `API rate limit `_. * :code:`LOAD_USED_PACKAGES_ONLY`: Setting it to any value will cause LeanDojo to load only the dependency files that are actually used by the target repo. Otherwise, for Lean 4, it will load all files in the dependency repos. Not set by default. +* :code:`LOW_MEMORY_MODE`: Setting it to any value will cause LeanDojo to use less memory. In the low-memory mode, LeanDojo does not trace premise information. Not set by default. * :code:`VERBOSE` or :code:`DEBUG`: Setting either of them to any value will cause LeanDojo to print debug information. Not set by default. LeanDojo supports `python-dotenv `_. You can use it to manage environment variables in a :file:`.env` file. diff --git a/pyproject.toml b/pyproject.toml index 0630388a..646baebf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ exclude = [ [project] name = "lean-dojo" -version = "1.5.1" +version = "1.6.0rc" authors = [ { name="Kaiyu Yang", email="kaiyuy@caltech.edu" }, ] diff --git a/src/lean_dojo/__init__.py b/src/lean_dojo/__init__.py index eda3a130..e0efd567 100644 --- a/src/lean_dojo/__init__.py +++ b/src/lean_dojo/__init__.py @@ -25,7 +25,7 @@ ProofGivenUp, ) from .interaction.parse_goals import Declaration, Goal, parse_goals -from .data_extraction.lean import LeanGitRepo, LeanFile, Theorem, Pos +from .data_extraction.lean import get_latest_commit, LeanGitRepo, LeanFile, Theorem, Pos from .constants import __version__ if os.geteuid() == 0: diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index e4cef1bd..a4bed6f3 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -11,11 +11,10 @@ from typing import Tuple from loguru import logger from dotenv import load_dotenv -from github import Github, Auth load_dotenv() -__version__ = "1.5.1" +__version__ = "1.6.0rc" logger.remove() if "VERBOSE" in os.environ or "DEBUG" in os.environ: @@ -59,27 +58,6 @@ LEAN4_URL = "https://github.com/leanprover/lean4" """The URL of the Lean 4 repo.""" -GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) -"""GiHub personal access token is optional. -If provided, it can increase the rate limit for GitHub API calls. -""" - -if GITHUB_ACCESS_TOKEN: - logger.debug("Using GitHub personal access token for authentication") - GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN)) - GITHUB.get_user().login -else: - logger.debug( - "Using GitHub without authentication. Don't be surprised if you hit the API rate limit." - ) - GITHUB = Github() - -LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") -"""The GitHub Repo for Lean 4 itself.""" - -LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") -"""The GitHub Repo for Lean 4 nightly releases.""" - LEAN4_PACKAGES_DIR_OLD = Path("lake-packages") """The directory where Lean 4 dependencies are stored (before v4.3.0-rc2).""" @@ -89,6 +67,14 @@ LOAD_USED_PACKAGES_ONLY = "LOAD_USED_PACKAGES_ONLY" in os.environ """Only load depdendency files that are actually used by the target repo.""" +LOW_MEMORY_MODE = "LOW_MEMORY_MODE" in os.environ +"""Use less memory when tracing Lean 4 repos. LeanDojo cannot trace premise information in this mode.""" + +if LOW_MEMORY_MODE: + logger.warning( + "Running in low memory mode. LeanDojo cannot trace premise information in this mode." + ) + LEAN4_BUILD_DIR = Path(".lake/build") LEAN_BUILD_DIR_OLD = Path("build") diff --git a/src/lean_dojo/data_extraction/ExtractData.lean b/src/lean_dojo/data_extraction/ExtractData.lean index 03578e98..43bd5688 100644 --- a/src/lean_dojo/data_extraction/ExtractData.lean +++ b/src/lean_dojo/data_extraction/ExtractData.lean @@ -359,35 +359,43 @@ private def visitTermInfo (ti : TermInfo) (env : Environment) : TraceM Unit := d } -private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do +private def visitInfo (noPremises : Bool) (ctx : ContextInfo) +(i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do match i with | .ofTacticInfo ti => visitTacticInfo ctx ti parent - | .ofTermInfo ti => visitTermInfo ti env + | .ofTermInfo ti => + if noPremises then + pure () + else + visitTermInfo ti env | _ => pure () -private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree) (parent : InfoTree) (env : Environment) : TraceM Unit := do +private partial def traverseTree (noPremises : Bool) (ctx: ContextInfo) (tree : InfoTree) +(parent : InfoTree) (env : Environment) : TraceM Unit := do match tree with - | .context ctx' t => traverseTree ctx' t tree env + | .context ctx' t => traverseTree noPremises ctx' t tree env | .node i children => - visitInfo ctx i parent env + visitInfo noPremises ctx i parent env for x in children do - traverseTree ctx x tree env + traverseTree noPremises ctx x tree env | _ => pure () -private def traverseTopLevelTree (tree : InfoTree) (env : Environment) : TraceM Unit := do +private def traverseTopLevelTree (tree : InfoTree) +(noPremises : Bool) (env : Environment) : TraceM Unit := do match tree with - | .context ctx t => traverseTree ctx t tree env + | .context ctx t => traverseTree noPremises ctx t tree env | _ => pure () /-- Process an array of `InfoTree` (one for each top-level command in the file). -/ -def traverseForest (trees : Array InfoTree) (env : Environment) : TraceM Trace := do +def traverseForest (trees : Array InfoTree) +(noPremises : Bool) (env : Environment) : TraceM Trace := do for t in trees do - traverseTopLevelTree t env + traverseTopLevelTree t noPremises env get @@ -397,10 +405,37 @@ end Traversal open Traversal +def getImports (header: Syntax) : IO String := do + -- Similar to `lean --deps` in Lean 3. + let mut s := "" + + for dep in headerToImports header do + let oleanPath ← findOLean dep.module + if oleanPath.isRelative then + let leanPath := Path.toSrcDir! oleanPath "lean" + assert! ← leanPath.pathExists + s := s ++ "\n" ++ leanPath.toString + else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then + let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString + let mut found := false + for c in (oleanPath.withExtension "lean").components do + if c == "lib" then + found := true + p := p ++ "src" + continue + if found then + p := p ++ FilePath.pathSeparator.toString ++ c + p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake" + assert! ← FilePath.mk p |>.pathExists + s := s ++ "\n" ++ p + + return s.trim + + /-- Trace a *.lean file. -/ -unsafe def processFile (path : FilePath) : IO Unit := do +unsafe def processFile (path : FilePath) (noPremises : Bool) : IO Unit := do println! path let input ← IO.FS.readFile path let opts := Options.empty.setBool `trace.Elab.info true @@ -422,7 +457,7 @@ unsafe def processFile (path : FilePath) : IO Unit := do let commands := s.commands.pop -- Remove EOI command. let trees := s.commandState.infoState.trees.toArray - let traceM := (traverseForest trees env').run' ⟨#[header] ++ commands, #[], #[]⟩ + let traceM := (traverseForest trees noPremises env').run' ⟨#[header] ++ commands, #[], #[]⟩ let (trace, _) ← traceM.run'.toIO {fileName := s!"{path}", fileMap := FileMap.ofString input} {env := env} let cwd ← IO.currentDir @@ -433,31 +468,9 @@ unsafe def processFile (path : FilePath) : IO Unit := do Path.makeParentDirs json_path IO.FS.writeFile json_path (toJson trace).pretty - -- Print imports, similar to `lean --deps` in Lean 3. - let mut s := "" - for dep in headerToImports header do - let oleanPath ← findOLean dep.module - if oleanPath.isRelative then - let leanPath := Path.toSrcDir! oleanPath "lean" - assert! ← leanPath.pathExists - s := s ++ "\n" ++ leanPath.toString - else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then - let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString - let mut found := false - for c in (oleanPath.withExtension "lean").components do - if c == "lib" then - found := true - p := p ++ "src" - continue - if found then - p := p ++ FilePath.pathSeparator.toString ++ c - p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake" - assert! ← FilePath.mk p |>.pathExists - s := s ++ "\n" ++ p - let dep_path := Path.toBuildDir "ir" relativePath "dep_paths" |>.get! Path.makeParentDirs dep_path - IO.FS.writeFile dep_path s.trim + IO.FS.writeFile dep_path (← getImports header) end LeanDojo @@ -487,7 +500,7 @@ def shouldProcess (path : FilePath) (noDeps : Bool) : IO Bool := do /-- Trace all *.lean files in the current directory whose corresponding *.olean file exists. -/ -def processAllFiles (noDeps : Bool) : IO Unit := do +def processAllFiles (noDeps : Bool) (noPremises : Bool) : IO Unit := do let cwd ← IO.currentDir assert! cwd.fileName != "lean4" println! "Extracting data at {cwd}" @@ -496,8 +509,12 @@ def processAllFiles (noDeps : Bool) : IO Unit := do for path in ← System.FilePath.walkDir cwd do if ← shouldProcess path noDeps then println! path + let mut args := #["env", "lean", "--run", "ExtractData.lean"] + if noPremises then + args := args.push "noPremises" + args := args.push path.toString let t ← IO.asTask $ IO.Process.run - {cmd := "lake", args := #["env", "lean", "--run", "ExtractData.lean", path.toString]} + {cmd := "lake", args := args} tasks := tasks.push (t, path) for (t, path) in tasks do @@ -511,6 +528,12 @@ def processAllFiles (noDeps : Bool) : IO Unit := do unsafe def main (args : List String) : IO Unit := do match args with - | "nodeps" :: _ => processAllFiles true - | path :: _ => processFile (← Path.toAbsolute ⟨path⟩) - | [] => processAllFiles false + | ["noDeps", "noPremises"] + | ["noPremises", "noDeps"] => + processAllFiles (noDeps := true) (noPremises := true) + | ["noDeps"] => processAllFiles (noDeps := true) (noPremises := false) + | ["noPremises"] => processAllFiles (noDeps := false) (noPremises := true) + | [] => processAllFiles (noDeps := false) (noPremises := false) + | ["noPremises", path] => processFile (← Path.toAbsolute ⟨path⟩) (noPremises := true) + | [path] => processFile (← Path.toAbsolute ⟨path⟩) (noPremises := false) + | _ => throw $ IO.userError "Invalid arguments" diff --git a/src/lean_dojo/data_extraction/build_lean4_repo.py b/src/lean_dojo/data_extraction/build_lean4_repo.py index 6593c56c..428fbcf1 100644 --- a/src/lean_dojo/data_extraction/build_lean4_repo.py +++ b/src/lean_dojo/data_extraction/build_lean4_repo.py @@ -129,6 +129,7 @@ def check_files(packages_path: str, no_deps: bool) -> None: missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} if len(missing_jsons) > 0 or len(missing_deps) > 0: for p in missing_jsons.union(missing_deps): + import pdb; pdb.set_trace() logger.warning(f"Missing {p}") @@ -154,6 +155,7 @@ def is_new_version(v: str) -> bool: def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("repo_name") + parser.add_argument("--low-memory-mode", action="store_true") parser.add_argument("--no-deps", action="store_true") args = parser.parse_args() @@ -188,8 +190,11 @@ def main() -> None: logger.info(f"Tracing {repo_name}") with launch_progressbar(dirs_to_monitor): cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.low_memory_mode: + cmd += " noPremises" if args.no_deps: - cmd += " nodeps" + cmd += " noDeps" + logger.debug(cmd) run_cmd(cmd, capture_output=True) check_files(packages_path, args.no_deps) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index 71375391..79f43b71 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -16,7 +16,7 @@ get_repo_info, report_critical_failure, ) -from ..constants import CACHE_DIR, DISABLE_REMOTE_CACHE, REMOTE_CACHE_URL +from ..constants import CACHE_DIR, DISABLE_REMOTE_CACHE, REMOTE_CACHE_URL, LOW_MEMORY_MODE def _split_git_url(url: str) -> Tuple[str, str]: @@ -32,7 +32,10 @@ def _split_git_url(url: str) -> Tuple[str, str]: def _format_dirname(url: str, commit: str) -> str: user_name, repo_name = _split_git_url(url) - return f"{user_name}-{repo_name}-{commit}" + dirname = f"{user_name}-{repo_name}-{commit}" + if LOW_MEMORY_MODE: + dirname += "-lowmemory" + return dirname _CACHE_CORRPUTION_MSG = "The cache may have been corrputed!" diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 4af5325b..535b342e 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -6,11 +6,13 @@ import os import json import toml +import time import urllib import webbrowser from pathlib import Path from loguru import logger from functools import cache +from github import Github, Auth from dataclasses import dataclass, field from github.Repository import Repository from typing import List, Dict, Any, Generator, Union, Optional, Tuple @@ -19,17 +21,12 @@ execute, read_url, url_exists, - url_to_repo, - normalize_url, get_repo_info, working_directory, - get_latest_commit, ) from ..constants import ( LEAN3_URL, LEAN4_URL, - LEAN4_REPO, - LEAN4_NIGHTLY_REPO, LEAN3_PACKAGES_DIR, LEAN4_PACKAGES_DIR, LEAN4_PACKAGES_DIR_OLD, @@ -38,6 +35,58 @@ ) +GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) +"""GiHub personal access token is optional. +If provided, it can increase the rate limit for GitHub API calls. +""" + +if GITHUB_ACCESS_TOKEN: + logger.debug("Using GitHub personal access token for authentication") + GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN)) + GITHUB.get_user().login +else: + logger.debug( + "Using GitHub without authentication. Don't be surprised if you hit the API rate limit." + ) + GITHUB = Github() + +LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") +"""The GitHub Repo for Lean 4 itself.""" + +LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") +"""The GitHub Repo for Lean 4 nightly releases.""" + +_URL_REGEX = re.compile(r"(?P.*?)/*") + + +def normalize_url(url: str) -> str: + return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + + +@cache +def url_to_repo(url: str, num_retries: int = 2) -> Repository: + url = normalize_url(url) + backoff = 1 + + while True: + try: + return GITHUB.get_repo("/".join(url.split("/")[-2:])) + except Exception as ex: + if num_retries <= 0: + raise ex + num_retries -= 1 + logger.debug(f'url_to_repo("{url}") failed. Retrying...') + time.sleep(backoff) + backoff *= 2 + + +@cache +def get_latest_commit(url: str) -> str: + """Get the hash of the latest commit of the Git repo at ``url``.""" + repo = url_to_repo(url) + return repo.get_branch(repo.default_branch).commit.sha + + def cleanse_string(s: Union[str, Path]) -> str: """Replace : and / with _ in a string.""" return str(s).replace("/", "_").replace(":", "_") diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 367ba72e..2823700a 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -18,6 +18,7 @@ from ..constants import ( NUM_PROCS, LEAN3_URL, + LOW_MEMORY_MODE, MIN_LEAN3_VERSION, ) from .lean import LeanGitRepo @@ -116,6 +117,8 @@ def _trace_lean4(repo: LeanGitRepo, build_deps: bool) -> None: LEAN4_DATA_EXTRACTOR_PATH: f"/workspace/{repo.name}/{LEAN4_DATA_EXTRACTOR_PATH.name}", } cmd = f"python build_lean4_repo.py {repo.name}" + if LOW_MEMORY_MODE: + cmd += " --low-memory-mode" if not build_deps: cmd += " --no-deps" diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 08917615..d58f0bc6 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -14,11 +14,10 @@ from loguru import logger from functools import cache from contextlib import contextmanager -from github.Repository import Repository from ray.util.actor_pool import ActorPool from typing import Tuple, Union, List, Generator, Optional -from .constants import GITHUB, NUM_WORKERS, TMP_DIR +from .constants import NUM_WORKERS, TMP_DIR @contextmanager @@ -240,37 +239,6 @@ def parse_str_list(s: str) -> List[str]: return [_.strip()[1:-1] for _ in s[1:-1].split(",") if _ != ""] -_URL_REGEX = re.compile(r"(?P.*?)/*") - - -def normalize_url(url: str) -> str: - return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. - - -@cache -def url_to_repo(url: str, num_retries: int = 2) -> Repository: - url = normalize_url(url) - backoff = 1 - - while True: - try: - return GITHUB.get_repo("/".join(url.split("/")[-2:])) - except Exception as ex: - if num_retries <= 0: - raise ex - num_retries -= 1 - logger.debug(f'url_to_repo("{url}") failed. Retrying...') - time.sleep(backoff) - backoff *= 2 - - -@cache -def get_latest_commit(url: str) -> str: - """Get the hash of the latest commit of the Git repo at ``url``.""" - repo = url_to_repo(url) - return repo.get_branch(repo.default_branch).commit.sha - - @cache def is_git_repo(path: Path) -> bool: """Check if ``path`` is a Git repo.""" diff --git a/tests/conftest.py b/tests/conftest.py index 4a7dd114..33b82ef4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,6 @@ import pytest from lean_dojo import * -from lean_dojo.utils import get_latest_commit LEAN3_URL = "https://github.com/leanprover-community/lean" From 2bf18e5063a186e7f459c9e3ccd52144b8c17a6f Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Fri, 9 Feb 2024 08:07:18 -0600 Subject: [PATCH 2/4] format code --- docker/Dockerfile | 1 + src/lean_dojo/data_extraction/ExtractData.lean | 5 ++--- src/lean_dojo/data_extraction/build_lean4_repo.py | 4 +++- src/lean_dojo/data_extraction/cache.py | 7 ++++++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8cbd005b..4d95722d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,7 @@ FROM kitware/cmake:ci-clang_cxx_modules-x86_64-2023-02-15 RUN yum -y install which gmp-devel python3 python3-pip +RUN ln -s $(which python3) /usr/bin/python RUN pip3 install toml loguru tqdm ENV ELAN_HOME="/.elan" diff --git a/src/lean_dojo/data_extraction/ExtractData.lean b/src/lean_dojo/data_extraction/ExtractData.lean index 43bd5688..9e52d25d 100644 --- a/src/lean_dojo/data_extraction/ExtractData.lean +++ b/src/lean_dojo/data_extraction/ExtractData.lean @@ -438,11 +438,10 @@ Trace a *.lean file. unsafe def processFile (path : FilePath) (noPremises : Bool) : IO Unit := do println! path let input ← IO.FS.readFile path - let opts := Options.empty.setBool `trace.Elab.info true enableInitializersExecution let inputCtx := Parser.mkInputContext input path.toString let (header, parserState, messages) ← Parser.parseHeader inputCtx - let (env, messages) ← processHeader header opts messages inputCtx + let (env, messages) ← processHeader header {} messages inputCtx if messages.hasErrors then for msg in messages.toList do @@ -451,7 +450,7 @@ unsafe def processFile (path : FilePath) (noPremises : Bool) : IO Unit := do throw $ IO.userError "Errors during import; aborting" let env := env.setMainModule (← moduleNameOfFileName path none) - let commandState := { Command.mkState env messages opts with infoState.enabled := true } + let commandState := { Command.mkState env messages {} with infoState.enabled := true } let s ← IO.processCommands inputCtx parserState commandState let env' := s.commandState.env let commands := s.commands.pop -- Remove EOI command. diff --git a/src/lean_dojo/data_extraction/build_lean4_repo.py b/src/lean_dojo/data_extraction/build_lean4_repo.py index 428fbcf1..742fd3f2 100644 --- a/src/lean_dojo/data_extraction/build_lean4_repo.py +++ b/src/lean_dojo/data_extraction/build_lean4_repo.py @@ -129,7 +129,9 @@ def check_files(packages_path: str, no_deps: bool) -> None: missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} if len(missing_jsons) > 0 or len(missing_deps) > 0: for p in missing_jsons.union(missing_deps): - import pdb; pdb.set_trace() + import pdb + + pdb.set_trace() logger.warning(f"Missing {p}") diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index 79f43b71..c9df43cd 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -16,7 +16,12 @@ get_repo_info, report_critical_failure, ) -from ..constants import CACHE_DIR, DISABLE_REMOTE_CACHE, REMOTE_CACHE_URL, LOW_MEMORY_MODE +from ..constants import ( + CACHE_DIR, + DISABLE_REMOTE_CACHE, + REMOTE_CACHE_URL, + LOW_MEMORY_MODE, +) def _split_git_url(url: str) -> Tuple[str, str]: From 22a6e03b3dccf09b47f90f46f0616a3fc72ebe6c Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Fri, 9 Feb 2024 09:25:51 -0600 Subject: [PATCH 3/4] undo --- docs/source/troubleshooting.rst | 4 +- docs/source/user-guide.rst | 1 - src/lean_dojo/constants.py | 8 ---- .../data_extraction/ExtractData.lean | 45 +++++++------------ src/lean_dojo/data_extraction/cache.py | 3 -- src/lean_dojo/data_extraction/trace.py | 3 -- 6 files changed, 17 insertions(+), 47 deletions(-) diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 05981894..33a105be 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -20,9 +20,7 @@ Tracing Repos The most likely reason is your machine doesn't have enough memory. The amount of memory required depends on the repo you're tracing. For large repos, such as mathlib, you need at least 32 GB memory. If getting more memory is not an option, -you can try a smaller repo. Alternatively, you can run LeanDojo in the low-memory mode by setting the environment variable :code:`LOW_MEMORY_MODE=1`. In this mode, -LeanDojo will use less memory but cannot trace premise information. -If your machine has enough memory but the process still gets killed, please check +you can try a smaller repo. If your machine has enough memory but the process still gets killed, please check whether your Docker has access to all resources of host machine (see `here `_). Interacting with Lean diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index 22871b78..ef40ed0f 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -237,7 +237,6 @@ LeanDojo's behavior can be configured through the following environment variable * :code:`TACTIC_MEMORY_LIMIT`: Maximum memory when interacting with Lean (see the `--memory` flag of `docker run `_) (only applicable when :code:`CONTAINER=docker`). Default to 16 GB. * :code:`GITHUB_ACCESS_TOKEN`: GitHub `personal access token `_ for using the GitHub API. They are optional. If provided, they can increase the `API rate limit `_. * :code:`LOAD_USED_PACKAGES_ONLY`: Setting it to any value will cause LeanDojo to load only the dependency files that are actually used by the target repo. Otherwise, for Lean 4, it will load all files in the dependency repos. Not set by default. -* :code:`LOW_MEMORY_MODE`: Setting it to any value will cause LeanDojo to use less memory. In the low-memory mode, LeanDojo does not trace premise information. Not set by default. * :code:`VERBOSE` or :code:`DEBUG`: Setting either of them to any value will cause LeanDojo to print debug information. Not set by default. LeanDojo supports `python-dotenv `_. You can use it to manage environment variables in a :file:`.env` file. diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index a4bed6f3..375fe469 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -67,14 +67,6 @@ LOAD_USED_PACKAGES_ONLY = "LOAD_USED_PACKAGES_ONLY" in os.environ """Only load depdendency files that are actually used by the target repo.""" -LOW_MEMORY_MODE = "LOW_MEMORY_MODE" in os.environ -"""Use less memory when tracing Lean 4 repos. LeanDojo cannot trace premise information in this mode.""" - -if LOW_MEMORY_MODE: - logger.warning( - "Running in low memory mode. LeanDojo cannot trace premise information in this mode." - ) - LEAN4_BUILD_DIR = Path(".lake/build") LEAN_BUILD_DIR_OLD = Path("build") diff --git a/src/lean_dojo/data_extraction/ExtractData.lean b/src/lean_dojo/data_extraction/ExtractData.lean index 9e52d25d..a88813a0 100644 --- a/src/lean_dojo/data_extraction/ExtractData.lean +++ b/src/lean_dojo/data_extraction/ExtractData.lean @@ -359,43 +359,37 @@ private def visitTermInfo (ti : TermInfo) (env : Environment) : TraceM Unit := d } -private def visitInfo (noPremises : Bool) (ctx : ContextInfo) +private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do match i with | .ofTacticInfo ti => visitTacticInfo ctx ti parent - | .ofTermInfo ti => - if noPremises then - pure () - else - visitTermInfo ti env + | .ofTermInfo ti => visitTermInfo ti env | _ => pure () -private partial def traverseTree (noPremises : Bool) (ctx: ContextInfo) (tree : InfoTree) +private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree) (parent : InfoTree) (env : Environment) : TraceM Unit := do match tree with - | .context ctx' t => traverseTree noPremises ctx' t tree env + | .context ctx' t => traverseTree ctx' t tree env | .node i children => - visitInfo noPremises ctx i parent env + visitInfo ctx i parent env for x in children do - traverseTree noPremises ctx x tree env + traverseTree ctx x tree env | _ => pure () -private def traverseTopLevelTree (tree : InfoTree) -(noPremises : Bool) (env : Environment) : TraceM Unit := do +private def traverseTopLevelTree (tree : InfoTree) (env : Environment) : TraceM Unit := do match tree with - | .context ctx t => traverseTree noPremises ctx t tree env + | .context ctx t => traverseTree ctx t tree env | _ => pure () /-- Process an array of `InfoTree` (one for each top-level command in the file). -/ -def traverseForest (trees : Array InfoTree) -(noPremises : Bool) (env : Environment) : TraceM Trace := do +def traverseForest (trees : Array InfoTree) (env : Environment) : TraceM Trace := do for t in trees do - traverseTopLevelTree t noPremises env + traverseTopLevelTree t env get @@ -435,7 +429,7 @@ def getImports (header: Syntax) : IO String := do /-- Trace a *.lean file. -/ -unsafe def processFile (path : FilePath) (noPremises : Bool) : IO Unit := do +unsafe def processFile (path : FilePath) : IO Unit := do println! path let input ← IO.FS.readFile path enableInitializersExecution @@ -456,7 +450,7 @@ unsafe def processFile (path : FilePath) (noPremises : Bool) : IO Unit := do let commands := s.commands.pop -- Remove EOI command. let trees := s.commandState.infoState.trees.toArray - let traceM := (traverseForest trees noPremises env').run' ⟨#[header] ++ commands, #[], #[]⟩ + let traceM := (traverseForest trees env').run' ⟨#[header] ++ commands, #[], #[]⟩ let (trace, _) ← traceM.run'.toIO {fileName := s!"{path}", fileMap := FileMap.ofString input} {env := env} let cwd ← IO.currentDir @@ -499,7 +493,7 @@ def shouldProcess (path : FilePath) (noDeps : Bool) : IO Bool := do /-- Trace all *.lean files in the current directory whose corresponding *.olean file exists. -/ -def processAllFiles (noDeps : Bool) (noPremises : Bool) : IO Unit := do +def processAllFiles (noDeps : Bool) : IO Unit := do let cwd ← IO.currentDir assert! cwd.fileName != "lean4" println! "Extracting data at {cwd}" @@ -509,8 +503,6 @@ def processAllFiles (noDeps : Bool) (noPremises : Bool) : IO Unit := do if ← shouldProcess path noDeps then println! path let mut args := #["env", "lean", "--run", "ExtractData.lean"] - if noPremises then - args := args.push "noPremises" args := args.push path.toString let t ← IO.asTask $ IO.Process.run {cmd := "lake", args := args} @@ -527,12 +519,7 @@ def processAllFiles (noDeps : Bool) (noPremises : Bool) : IO Unit := do unsafe def main (args : List String) : IO Unit := do match args with - | ["noDeps", "noPremises"] - | ["noPremises", "noDeps"] => - processAllFiles (noDeps := true) (noPremises := true) - | ["noDeps"] => processAllFiles (noDeps := true) (noPremises := false) - | ["noPremises"] => processAllFiles (noDeps := false) (noPremises := true) - | [] => processAllFiles (noDeps := false) (noPremises := false) - | ["noPremises", path] => processFile (← Path.toAbsolute ⟨path⟩) (noPremises := true) - | [path] => processFile (← Path.toAbsolute ⟨path⟩) (noPremises := false) + | ["noDeps"] => processAllFiles (noDeps := true) + | [path] => processFile (← Path.toAbsolute ⟨path⟩) + | [] => processAllFiles (noDeps := false) | _ => throw $ IO.userError "Invalid arguments" diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index c9df43cd..8ed81e6e 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -20,7 +20,6 @@ CACHE_DIR, DISABLE_REMOTE_CACHE, REMOTE_CACHE_URL, - LOW_MEMORY_MODE, ) @@ -38,8 +37,6 @@ def _split_git_url(url: str) -> Tuple[str, str]: def _format_dirname(url: str, commit: str) -> str: user_name, repo_name = _split_git_url(url) dirname = f"{user_name}-{repo_name}-{commit}" - if LOW_MEMORY_MODE: - dirname += "-lowmemory" return dirname diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 2823700a..367ba72e 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -18,7 +18,6 @@ from ..constants import ( NUM_PROCS, LEAN3_URL, - LOW_MEMORY_MODE, MIN_LEAN3_VERSION, ) from .lean import LeanGitRepo @@ -117,8 +116,6 @@ def _trace_lean4(repo: LeanGitRepo, build_deps: bool) -> None: LEAN4_DATA_EXTRACTOR_PATH: f"/workspace/{repo.name}/{LEAN4_DATA_EXTRACTOR_PATH.name}", } cmd = f"python build_lean4_repo.py {repo.name}" - if LOW_MEMORY_MODE: - cmd += " --low-memory-mode" if not build_deps: cmd += " --no-deps" From 6131b4521445ffe57c808c2cdc43fc437f038ae8 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Fri, 9 Feb 2024 11:18:13 -0600 Subject: [PATCH 4/4] minor fix --- src/lean_dojo/data_extraction/ExtractData.lean | 7 ++----- src/lean_dojo/data_extraction/build_lean4_repo.py | 6 ------ src/lean_dojo/data_extraction/cache.py | 3 +-- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/src/lean_dojo/data_extraction/ExtractData.lean b/src/lean_dojo/data_extraction/ExtractData.lean index a88813a0..b5feef65 100644 --- a/src/lean_dojo/data_extraction/ExtractData.lean +++ b/src/lean_dojo/data_extraction/ExtractData.lean @@ -359,8 +359,7 @@ private def visitTermInfo (ti : TermInfo) (env : Environment) : TraceM Unit := d } -private def visitInfo (ctx : ContextInfo) -(i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do +private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do match i with | .ofTacticInfo ti => visitTacticInfo ctx ti parent | .ofTermInfo ti => visitTermInfo ti env @@ -502,10 +501,8 @@ def processAllFiles (noDeps : Bool) : IO Unit := do for path in ← System.FilePath.walkDir cwd do if ← shouldProcess path noDeps then println! path - let mut args := #["env", "lean", "--run", "ExtractData.lean"] - args := args.push path.toString let t ← IO.asTask $ IO.Process.run - {cmd := "lake", args := args} + {cmd := "lake", args := #["env", "lean", "--run", "ExtractData.lean", path.toString]} tasks := tasks.push (t, path) for (t, path) in tasks do diff --git a/src/lean_dojo/data_extraction/build_lean4_repo.py b/src/lean_dojo/data_extraction/build_lean4_repo.py index 742fd3f2..a129a975 100644 --- a/src/lean_dojo/data_extraction/build_lean4_repo.py +++ b/src/lean_dojo/data_extraction/build_lean4_repo.py @@ -129,9 +129,6 @@ def check_files(packages_path: str, no_deps: bool) -> None: missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} if len(missing_jsons) > 0 or len(missing_deps) > 0: for p in missing_jsons.union(missing_deps): - import pdb - - pdb.set_trace() logger.warning(f"Missing {p}") @@ -157,7 +154,6 @@ def is_new_version(v: str) -> bool: def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("repo_name") - parser.add_argument("--low-memory-mode", action="store_true") parser.add_argument("--no-deps", action="store_true") args = parser.parse_args() @@ -192,8 +188,6 @@ def main() -> None: logger.info(f"Tracing {repo_name}") with launch_progressbar(dirs_to_monitor): cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" - if args.low_memory_mode: - cmd += " noPremises" if args.no_deps: cmd += " noDeps" logger.debug(cmd) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index 8ed81e6e..acfe1c92 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -36,8 +36,7 @@ def _split_git_url(url: str) -> Tuple[str, str]: def _format_dirname(url: str, commit: str) -> str: user_name, repo_name = _split_git_url(url) - dirname = f"{user_name}-{repo_name}-{commit}" - return dirname + return f"{user_name}-{repo_name}-{commit}" _CACHE_CORRPUTION_MSG = "The cache may have been corrputed!"