Skip to content

Commit

Permalink
Merge pull request #125 from lean-dojo/dev
Browse files Browse the repository at this point in the history
Updates
  • Loading branch information
Kaiyu Yang authored Jan 9, 2024
2 parents e3b1d28 + b34d784 commit 6e6a4e7
Show file tree
Hide file tree
Showing 12 changed files with 335 additions and 276 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ exclude = [

[project]
name = "lean-dojo"
version = "1.4.5"
version = "1.5.0rc"
authors = [
{ name="Kaiyu Yang", email="[email protected]" },
]
Expand Down
381 changes: 191 additions & 190 deletions scripts/generate-benchmark-lean3.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scripts/generate-benchmark-lean4.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@
" + \"\\n\"\n",
" )\n",
" logger.info(\n",
" f\"{num_premises} theorems/definitions from {traced_repo.num_traced_files} files saved to {oup_path}\"\n",
" f\"{num_premises} theorems/definitions from {len(traced_repo.traced_files)} files saved to {oup_path}\"\n",
" )\n",
"\n",
"\n",
Expand Down
2 changes: 1 addition & 1 deletion src/lean_dojo/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

load_dotenv()

__version__ = "1.4.5"
__version__ = "1.5.0rc"

logger.remove()
if "VERBOSE" in os.environ or "DEBUG" in os.environ:
Expand Down
2 changes: 1 addition & 1 deletion src/lean_dojo/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _copy_file_or_dir(src: Path, dst: Path) -> None:
assert src.is_dir() and not src.is_relative_to(dst)
if dst.exists():
shutil.rmtree(dst)
shutil.copytree(src, dst)
shutil.copytree(src, dst, symlinks=True)


class NativeContainer(Container):
Expand Down
21 changes: 13 additions & 8 deletions src/lean_dojo/data_extraction/ExtractData.lean
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,10 @@ Convert the path of a *.lean file to its corresponding file (e.g., *.olean) in t
def toBuildDir (subDir : FilePath) (path : FilePath) (ext : String) : Option FilePath :=
let path' := (trim path).withExtension ext
match relativeTo path' $ packagesDir / "lean4/src" with
| some p => packagesDir / "lean4/lib" / p
| some p =>
match relativeTo p "lean/lake" with
| some p' => packagesDir / "lean4/lib/lean" / p'
| none => packagesDir / "lean4/lib" / p
| none => match relativeTo path' packagesDir with
| some p =>
match p.components with
Expand Down Expand Up @@ -448,6 +451,7 @@ unsafe def processFile (path : FilePath) : IO Unit := do
continue
if found then
p := p ++ FilePath.pathSeparator.toString ++ c
p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake"
assert! ← FilePath.mk p |>.pathExists
s := s ++ "\n" ++ p

Expand All @@ -464,16 +468,17 @@ open LeanDojo
/--
Whether a *.lean file should be traced.
-/
def shouldProcess (path : FilePath) (noDep : Bool) : IO Bool := do
def shouldProcess (path : FilePath) (noDeps : Bool) : IO Bool := do
if (← path.isDir) ∨ path.extension != "lean" then
return false

if noDep ∧ Path.isRelativeTo path Path.packagesDir then
return false

let cwd ← IO.currentDir
let some relativePath := Path.relativeTo path cwd |
throw $ IO.userError s!"Invalid path: {path}"

if noDeps ∧ Path.isRelativeTo relativePath Path.packagesDir then
return false

let some oleanPath := Path.toBuildDir "lib" relativePath "olean" |
throw $ IO.userError s!"Invalid path: {path}"
return ← oleanPath.pathExists
Expand All @@ -482,14 +487,14 @@ def shouldProcess (path : FilePath) (noDep : Bool) : IO Bool := do
/--
Trace all *.lean files in the current directory whose corresponding *.olean file exists.
-/
def processAllFiles (noDep : Bool) : IO Unit := do
def processAllFiles (noDeps : Bool) : IO Unit := do
let cwd ← IO.currentDir
assert! cwd.fileName != "lean4"
println! "Extracting data at {cwd}"

let mut tasks := #[]
for path in ← System.FilePath.walkDir cwd do
if ← shouldProcess path noDep then
if ← shouldProcess path noDeps then
println! path
let t ← IO.asTask $ IO.Process.run
{cmd := "lake", args := #["env", "lean", "--run", "ExtractData.lean", path.toString]}
Expand All @@ -506,6 +511,6 @@ def processAllFiles (noDep : Bool) : IO Unit := do

unsafe def main (args : List String) : IO Unit := do
match args with
| "nodep" :: _ => processAllFiles true
| "nodeps" :: _ => processAllFiles true
| path :: _ => processFile (← Path.toAbsolute ⟨path⟩)
| [] => processAllFiles false
45 changes: 33 additions & 12 deletions src/lean_dojo/data_extraction/build_lean4_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"""
import os
import re
import sys
import shutil
import argparse
import itertools
import subprocess
from tqdm import tqdm
Expand Down Expand Up @@ -104,16 +104,23 @@ def get_lean_version() -> str:
return m["version"]


def check_files() -> None:
def check_files(packages_path: str, no_deps: bool) -> None:
"""Check if all *.lean files have been processed to produce *.ast.json and *.dep_paths files."""
cwd = Path.cwd()
jsons = {
p.with_suffix("").with_suffix("") for p in cwd.glob("**/build/ir/**/*.ast.json")
p.with_suffix("").with_suffix("")
for p in cwd.glob("**/build/ir/**/*.ast.json")
if not no_deps or not p.is_relative_to(packages_path)
}
deps = {
p.with_suffix("")
for p in cwd.glob("**/build/ir/**/*.dep_paths")
if not no_deps or not p.is_relative_to(packages_path)
}
deps = {p.with_suffix("") for p in cwd.glob("**/build/ir/**/*.dep_paths")}
oleans = {
Path(str(p.with_suffix("")).replace("/build/lib/", "/build/ir/"))
for p in cwd.glob("**/build/lib/**/*.olean")
if not no_deps or not p.is_relative_to(packages_path)
}
assert len(jsons) <= len(oleans) and len(deps) <= len(oleans)
missing_jsons = {p.with_suffix(".ast.json") for p in oleans - jsons}
Expand All @@ -123,7 +130,7 @@ def check_files() -> None:
logger.warning(f"Missing {p}")


def is_new_version(v) -> bool:
def is_new_version(v: str) -> bool:
"""Check if ``v`` is at least `4.3.0-rc2`."""
major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")]
if major < 4 or (major == 4 and minor < 3):
Expand All @@ -143,12 +150,23 @@ def is_new_version(v) -> bool:


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("repo_name")
parser.add_argument("--no-deps", action="store_true")
args = parser.parse_args()

num_procs = int(os.environ["NUM_PROCS"])
repo_name = sys.argv[1]
repo_name = args.repo_name
os.chdir(repo_name)

# Build the repo using lake.
logger.info(f"Building {repo_name}")
if args.no_deps:
# The additional *.olean files wouldn't matter.
try:
run_cmd("lake exe cache get")
except subprocess.CalledProcessError:
pass
run_cmd("lake build")

# Copy the Lean 4 stdlib into the path of packages.
Expand All @@ -162,14 +180,17 @@ def main() -> None:
shutil.copytree(lean_prefix, f"{packages_path}/lean4")

# Run ExtractData.lean to extract ASTs, tactic states, and premise information.
dirs_to_monitor = [build_path]
if not args.no_deps:
dirs_to_monitor.append(packages_path)
logger.info(f"Tracing {repo_name}")
with launch_progressbar([build_path, packages_path]):
run_cmd(
f"lake env lean --threads {num_procs} --run ExtractData.lean",
capture_output=True,
)
with launch_progressbar(dirs_to_monitor):
cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean"
if args.no_deps:
cmd += " nodeps"
run_cmd(cmd, capture_output=True)

check_files()
check_files(packages_path, args.no_deps)


if __name__ == "__main__":
Expand Down
8 changes: 3 additions & 5 deletions src/lean_dojo/data_extraction/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,13 @@ def get(self, url: str, commit: str) -> Optional[Path]:

def store(self, src: Path) -> Path:
"""Store a traced repo at path ``src``. Return its path in the cache."""
dirs = list(src.glob("*"))
assert len(dirs) == 1, f"Unexpected number of directories in {src}"
url, commit = get_repo_info(dirs[0])
url, commit = get_repo_info(src)
dirpath = self.cache_dir / _format_dirname(url, commit)
_, repo_name = _split_git_url(url)
if not dirpath.exists():
with self.lock:
with report_critical_failure(_CACHE_CORRPUTION_MSG):
shutil.copytree(src, dirpath)
_, repo_name = _split_git_url(url)
shutil.copytree(src, dirpath / repo_name)
return dirpath / repo_name


Expand Down
6 changes: 3 additions & 3 deletions src/lean_dojo/data_extraction/lean.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def _get_lean4_dependencies(

try:
lake_manifest = (
self.get_config("lake-manifest.json")
self.get_config("lake-manifest.json", num_retries=0)
if path is None
else json.load((Path(path) / "lake-manifest.json").open())
)
Expand Down Expand Up @@ -638,10 +638,10 @@ def _get_config_url(self, filename: str) -> str:
url = self.url.replace("github.com", "raw.githubusercontent.com")
return f"{url}/{self.commit}/{filename}"

def get_config(self, filename: str) -> Dict[str, Any]:
def get_config(self, filename: str, num_retries: int = 2) -> Dict[str, Any]:
"""Return the repo's files."""
config_url = self._get_config_url(filename)
content = read_url(config_url)
content = read_url(config_url, num_retries)
if filename.endswith(".toml"):
return toml.loads(content)
elif filename.endswith(".json"):
Expand Down
33 changes: 22 additions & 11 deletions src/lean_dojo/data_extraction/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,16 @@ def _modify_lean3(version: str) -> None:
raise ex


def _trace(repo: LeanGitRepo) -> None:
def _trace(repo: LeanGitRepo, build_deps: bool) -> None:
assert (
repo.exists()
), f"The {repo} does not exist. Please check the URL `{repo.commit_url}`."
if repo.uses_lean3:
assert build_deps
_trace_lean3(repo)
else:
assert repo.uses_lean4
_trace_lean4(repo)
_trace_lean4(repo, build_deps)


def _trace_lean3(repo: LeanGitRepo) -> None:
Expand Down Expand Up @@ -101,7 +102,7 @@ def _trace_lean3(repo: LeanGitRepo) -> None:
raise ex


def _trace_lean4(repo: LeanGitRepo) -> None:
def _trace_lean4(repo: LeanGitRepo, build_deps: bool) -> None:
# Trace `repo` in the current working directory.
assert not repo.is_lean4, "Cannot trace the Lean 4 repo itself."
repo.clone_and_checkout()
Expand All @@ -113,9 +114,13 @@ def _trace_lean4(repo: LeanGitRepo) -> None:
LEAN4_BUILD_SCRIPT_PATH: f"/workspace/{LEAN4_BUILD_SCRIPT_PATH.name}",
LEAN4_DATA_EXTRACTOR_PATH: f"/workspace/{repo.name}/{LEAN4_DATA_EXTRACTOR_PATH.name}",
}
cmd = f"python3 build_lean4_repo.py {repo.name}"
if not build_deps:
cmd += " --no-deps"

try:
container.run(
f"python3 build_lean4_repo.py {repo.name}",
cmd,
create_mounts(mts),
{"NUM_PROCS": NUM_PROCS},
as_current_user=True,
Expand All @@ -134,13 +139,14 @@ def is_available_in_cache(repo: LeanGitRepo) -> bool:
return cache.get(repo.url, repo.commit) is not None


def get_traced_repo_path(repo: LeanGitRepo) -> Path:
def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path:
"""Return the path of a traced repo in the cache.
The function will trace a repo if it is not available in the cache. See :ref:`caching` for details.
Args:
repo (LeanGitRepo): The Lean repo to trace.
build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True.
Returns:
Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2`
Expand All @@ -150,16 +156,20 @@ def get_traced_repo_path(repo: LeanGitRepo) -> Path:
logger.info(f"Tracing {repo}")
with working_directory() as tmp_dir:
logger.debug(f"Working in the temporary directory {tmp_dir}")
_trace(repo)
traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name)
_trace(repo, build_deps)
traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps)
traced_repo.save_to_disk()
path = cache.store(tmp_dir)
path = cache.store(tmp_dir / repo.name)
else:
logger.debug("The traced repo is available in the cache.")
return path


def trace(repo: LeanGitRepo, dst_dir: Optional[Union[str, Path]] = None) -> TracedRepo:
def trace(
repo: LeanGitRepo,
dst_dir: Optional[Union[str, Path]] = None,
build_deps: bool = True,
) -> TracedRepo:
"""Trace a repo (and its dependencies), saving the results to ``dst_dir``.
The function only traces the repo when it's not available in the cache. Otherwise,
Expand All @@ -168,6 +178,7 @@ def trace(repo: LeanGitRepo, dst_dir: Optional[Union[str, Path]] = None) -> Trac
Args:
repo (LeanGitRepo): The Lean repo to trace.
dst_dir (Union[str, Path]): The directory for saving the traced repo. If None, the traced repo is only saved in the cahe.
build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True.
Returns:
TracedRepo: A :class:`TracedRepo` object corresponding to the files at ``dst_dir``.
Expand All @@ -178,9 +189,9 @@ def trace(repo: LeanGitRepo, dst_dir: Optional[Union[str, Path]] = None) -> Trac
not dst_dir.exists()
), f"The destination directory {dst_dir} already exists."

cached_path = get_traced_repo_path(repo)
cached_path = get_traced_repo_path(repo, build_deps)
logger.info(f"Loading the traced repo from {cached_path}")
traced_repo = TracedRepo.load_from_disk(cached_path)
traced_repo = TracedRepo.load_from_disk(cached_path, build_deps)
traced_repo.check_sanity()

if dst_dir is not None:
Expand Down
Loading

0 comments on commit 6e6a4e7

Please sign in to comment.