From 9c266a986a93edcd7df2927d7eaaa98352af5553 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 10 Jul 2024 01:08:12 +0800 Subject: [PATCH 01/34] add repo types: github/local/remote --- src/lean_dojo/data_extraction/lean.py | 69 ++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 4b73f90..4938ee6 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -17,6 +17,8 @@ from github.Repository import Repository from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator +from urllib.parse import urlparse +from git import Repo, GitCommandError from ..utils import ( execute, @@ -43,27 +45,46 @@ ) GITHUB = Github() -LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") +# LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") +LEAN4_REPO = None """The GitHub Repo for Lean 4 itself.""" -LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") +# LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") +LEAN4_NIGHTLY_REPO = None """The GitHub Repo for Lean 4 nightly releases.""" _URL_REGEX = re.compile(r"(?P.*?)/*") -def normalize_url(url: str) -> str: +def normalize_url(url: str, repo_type:str ='github') -> str: + if repo_type == 'local': + return os.path.abspath(url) # Convert to absolute path if local return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. @cache -def url_to_repo(url: str, num_retries: int = 2) -> Repository: +def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Repo: + """Convert a URL to a Repo object. + + Args: + url (str): The URL of the repository. + num_retries (int): Number of retries in case of failure. + repo_type (str): The type of the repository. Defaults to 'github'. + + Returns: + Repo: A Git Repo object. + """ url = normalize_url(url) backoff = 1 while True: try: - return GITHUB.get_repo("/".join(url.split("/")[-2:])) + if repo_type == 'github': + return GITHUB.get_repo("/".join(url.split("/")[-2:])) + elif repo_type == 'local': + return Repo(url) + else: + return Repo.clone_from(url, '/tmp/repo_clone') # We might need to change this with TMP_DIR except Exception as ex: if num_retries <= 0: raise ex @@ -72,7 +93,6 @@ def url_to_repo(url: str, num_retries: int = 2) -> Repository: time.sleep(backoff) backoff *= 2 - @cache def get_latest_commit(url: str) -> str: """Get the hash of the latest commit of the Git repo at ``url``.""" @@ -330,6 +350,11 @@ def get_lean4_version_from_config(toolchain: str) -> str: def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: """Return the required Lean commit given a ``lean-toolchain`` config.""" + global LEAN4_NIGHTLY_REPO, LEAN4_REPO + if LEAN4_REPO is None: + LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") + if LEAN4_NIGHTLY_REPO is None: + LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") assert "content" in config_dict, "config_dict must have a 'content' field" config = config_dict["content"].strip() prefix = "leanprover/lean4:" @@ -398,9 +423,9 @@ class LeanGitRepo: """Git repo of a Lean project.""" url: str - """The repo's Github URL. + """The repo's URL. - Note that we only support Github as of now. + It can be a GitHub URL that starts with https://, a local path, or any other valid Git URL. """ commit: str @@ -417,13 +442,31 @@ class LeanGitRepo: """Required Lean version. """ + repo_type: str = field(init=False, repr=False) + """Type of the repo. It can be ``github``, ``local`` or ``remote``. + """ + def __post_init__(self) -> None: - if "github.com" not in self.url: - raise ValueError(f"{self.url} is not a Github URL") - if not self.url.startswith("https://"): + parsed_url = urlparse(self.url) + + if parsed_url.scheme in ["http", "https"]: + object.__setattr__(self, "url", normalize_url(self.url)) + # case 1 - GitHub URL + if "github.com" in self.url: + if not self.url.startswith("https://"): + raise ValueError(f"{self.url} should start with https://") + repo_type = "github" + # case 2 - remote Git URL + else: + repo_type = "remote" + # case 3 - local path + elif os.path.exists(parsed_url.path): + repo_type = "local" + else: raise ValueError(f"{self.url} is not a valid URL") - object.__setattr__(self, "url", normalize_url(self.url)) - object.__setattr__(self, "repo", url_to_repo(self.url)) + object.__setattr__(self, "repo_type", repo_type) + object.__setattr__(self, "url", normalize_url(self.url, repo_type=repo_type)) + object.__setattr__(self, "repo", url_to_repo(self.url, repo_type=repo_type)) # Convert tags or branches to commit hashes if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): From 60907a887f7f56eee02ae02cd0219be81de01e9d Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 10 Jul 2024 02:52:51 +0800 Subject: [PATCH 02/34] update property for repo types --- src/lean_dojo/data_extraction/lean.py | 39 ++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 4938ee6..15d5236 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -9,6 +9,7 @@ import time import urllib import webbrowser +import github from pathlib import Path from loguru import logger from functools import cache @@ -118,9 +119,23 @@ def _to_commit_hash(repo: Repository, label: str) -> str: for tag in repo.get_tags(): if tag.name == label: return tag.commit.sha + raise - raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") - +def _to_commit_hash(repo: Repository, label: str) -> str: + """Convert a tag or branch to a commit hash.""" + if isinstance(repo, github.Repository.Repository): + # GitHub repository + try: + commit = repo.get_commit(label).sha + except Exception as e: + raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + else: + # Local or remote Git repository + try: + commit = repo.rev_parse(label).hexsha + except GitCommandError as e: + raise ValueError(f"Error converting ref to commit hash: {e}") + return commit @dataclass(eq=True, unsafe_hash=True) class Pos: @@ -485,9 +500,14 @@ def __post_init__(self) -> None: lean_version = self.commit else: config = self.get_config("lean-toolchain") - lean_version = get_lean4_commit_from_config(config) - v = get_lean4_version_from_config(config["content"]) - if not is_supported_version(v): + toolchain = config["content"] + m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) + if m is not None: + lean_version = m["version"] + else: + lean_version_commit = get_lean4_commit_from_config(config) + lean_version = get_lean4_version_from_config(toolchain) + if not is_supported_version(lean_version): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" ) @@ -661,8 +681,13 @@ def _get_config_url(self, filename: str) -> str: def get_config(self, filename: str, num_retries: int = 2) -> Dict[str, Any]: """Return the repo's files.""" - config_url = self._get_config_url(filename) - content = read_url(config_url, num_retries) + if self.repo_type == 'github': + config_url = self._get_config_url(filename) + content = read_url(config_url, num_retries) + else: + working_dir = self.repo.working_dir + with open(os.path.join(working_dir, filename), "r") as f: + content = f.read() if filename.endswith(".toml"): return toml.loads(content) elif filename.endswith(".json"): From 46f328962f143b816024fddfcf83fae9f36d9007 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 10 Jul 2024 02:53:07 +0800 Subject: [PATCH 03/34] add tests --- .gitignore | 2 + pyproject.toml | 1 + src/lean_dojo/data_extraction/lean.py | 13 ------ tests/data_extraction/test_repo_type.py | 53 +++++++++++++++++++++++++ tests/data_extraction/testdata/.keep | 0 5 files changed, 56 insertions(+), 13 deletions(-) create mode 100644 tests/data_extraction/test_repo_type.py create mode 100644 tests/data_extraction/testdata/.keep diff --git a/.gitignore b/.gitignore index cd92677..4c7f50f 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,5 @@ dmypy.json # Pyre type checker .pyre/ +testdata/* +.vscode \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 287ce34..11d0fba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "python-dotenv", "loguru", "filelock", + "gitpython", "psutil", "types-psutil", "tqdm", diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 15d5236..0de935a 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -110,19 +110,6 @@ def cleanse_string(s: Union[str, Path]) -> str: def _to_commit_hash(repo: Repository, label: str) -> str: """Convert a tag or branch to a commit hash.""" logger.debug(f"Querying the commit hash for {repo.name} {label}") - - try: - return repo.get_branch(label).commit.sha - except GithubException: - pass - - for tag in repo.get_tags(): - if tag.name == label: - return tag.commit.sha - raise - -def _to_commit_hash(repo: Repository, label: str) -> str: - """Convert a tag or branch to a commit hash.""" if isinstance(repo, github.Repository.Repository): # GitHub repository try: diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py new file mode 100644 index 0000000..4c3ec15 --- /dev/null +++ b/tests/data_extraction/test_repo_type.py @@ -0,0 +1,53 @@ +from lean_dojo import LeanGitRepo +from git import Repo +from github import Github +import os, shutil + +# GitHub repository details +GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" +GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" + +# Local repository path (make sure this path exists and is the clone of the above GitHub repo) +LOCAL_REPO_PATH = "testdata/lean4-example" + +def clone_repo_if_not_exists(repo_url, local_path, label='main'): + if os.path.exists(local_path): + shutil.rmtree(local_path) + repo = Repo.clone_from(repo_url, local_path) + repo.git.checkout(label) + +def test_github_to_local(): + # Clone the GitHub repository to the local path + clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH) + + # Initialize GitHub repo + github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit="main") + + # Initialize local repo + local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") + + # Check if commit hashes match + assert github_repo.commit == local_repo.commit + assert github_repo.lean_version == local_repo.lean_version + + # check the repo type + assert github_repo.repo_type == 'github' + assert local_repo.repo_type == 'local' + +def test_github_to_local_with_commit(): + # Clone the GitHub repository to the local path + clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH, GITHUB_COMMIT_HASH) + + # Initialize GitHub repo + github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit=GITHUB_COMMIT_HASH) + + # Initialize local repo + local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit=GITHUB_COMMIT_HASH) + + # Check if commit hashes match + assert github_repo.commit == local_repo.commit + assert github_repo.lean_version == local_repo.lean_version + + # check the repo type + assert github_repo.repo_type == 'github' + assert local_repo.repo_type == 'local' diff --git a/tests/data_extraction/testdata/.keep b/tests/data_extraction/testdata/.keep new file mode 100644 index 0000000..e69de29 From cdf28c3aba852f7c8e5a507d197aaecc29483ab0 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Thu, 11 Jul 2024 03:46:30 +0800 Subject: [PATCH 04/34] update method & test for remote url --- src/lean_dojo/data_extraction/lean.py | 79 ++++++++++++------------- src/lean_dojo/utils.py | 28 +++++++++ tests/data_extraction/test_repo_type.py | 20 ++++++- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 0de935a..70cab47 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -9,7 +9,7 @@ import time import urllib import webbrowser -import github +import shutil from pathlib import Path from loguru import logger from functools import cache @@ -18,8 +18,8 @@ from github.Repository import Repository from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator -from urllib.parse import urlparse from git import Repo, GitCommandError +import uuid from ..utils import ( execute, @@ -27,8 +27,9 @@ url_exists, get_repo_info, working_directory, + repo_type_of_url, ) -from ..constants import LEAN4_URL +from ..constants import LEAN4_URL, TMP_DIR GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) @@ -62,7 +63,6 @@ def normalize_url(url: str, repo_type:str ='github') -> str: return os.path.abspath(url) # Convert to absolute path if local return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. - @cache def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Repo: """Convert a URL to a Repo object. @@ -83,9 +83,13 @@ def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Re if repo_type == 'github': return GITHUB.get_repo("/".join(url.split("/")[-2:])) elif repo_type == 'local': - return Repo(url) + if os.path.exists(url): + return Repo(url) + else: + raise ValueError(f"Local path {url} does not exist") else: - return Repo.clone_from(url, '/tmp/repo_clone') # We might need to change this with TMP_DIR + temp_dir = os.path.join(TMP_DIR or '/tmp', str(uuid.uuid4())[:8]) + return Repo.clone_from(url, f"{temp_dir}/{os.path.basename(url)}") except Exception as ex: if num_retries <= 0: raise ex @@ -107,21 +111,25 @@ def cleanse_string(s: Union[str, Path]) -> str: @cache -def _to_commit_hash(repo: Repository, label: str) -> str: +def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: """Convert a tag or branch to a commit hash.""" - logger.debug(f"Querying the commit hash for {repo.name} {label}") - if isinstance(repo, github.Repository.Repository): - # GitHub repository + # GitHub repository + if isinstance(repo, Repository): + logger.debug(f"Querying the commit hash for {repo.name} {label}") try: commit = repo.get_commit(label).sha - except Exception as e: - raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") - else: - # Local or remote Git repository + except GithubException as e: + raise ValueError(f"Invalid tag or branch: `{label}` for {repo.name}") + # Local or remote Git repository + elif isinstance(repo, Repo): + logger.debug(f"Querying the commit hash for {repo.working_dir} repository {label}") try: - commit = repo.rev_parse(label).hexsha + # Resolve the label to a commit hash + commit = repo.commit(label).hexsha except GitCommandError as e: raise ValueError(f"Error converting ref to commit hash: {e}") + else: + raise TypeError("Unsupported repository type") return commit @dataclass(eq=True, unsafe_hash=True) @@ -436,7 +444,7 @@ class LeanGitRepo: You can also use tags such as ``v3.5.0``. They will be converted to commit hashes. """ - repo: Repository = field(init=False, repr=False) + repo: Union[Repository, Repo] = field(init=False, repr=False) """A :class:`github.Repository` object. """ @@ -449,26 +457,14 @@ class LeanGitRepo: """ def __post_init__(self) -> None: - parsed_url = urlparse(self.url) - - if parsed_url.scheme in ["http", "https"]: - object.__setattr__(self, "url", normalize_url(self.url)) - # case 1 - GitHub URL - if "github.com" in self.url: - if not self.url.startswith("https://"): - raise ValueError(f"{self.url} should start with https://") - repo_type = "github" - # case 2 - remote Git URL - else: - repo_type = "remote" - # case 3 - local path - elif os.path.exists(parsed_url.path): - repo_type = "local" - else: + repo_type = repo_type_of_url(self.url) + if repo_type is None: raise ValueError(f"{self.url} is not a valid URL") object.__setattr__(self, "repo_type", repo_type) object.__setattr__(self, "url", normalize_url(self.url, repo_type=repo_type)) object.__setattr__(self, "repo", url_to_repo(self.url, repo_type=repo_type)) + if self.repo_type != 'github': + self.repo.git.checkout(self.commit) # Convert tags or branches to commit hashes if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): @@ -492,7 +488,7 @@ def __post_init__(self) -> None: if m is not None: lean_version = m["version"] else: - lean_version_commit = get_lean4_commit_from_config(config) + # lean_version_commit = get_lean4_commit_from_config(config) lean_version = get_lean4_version_from_config(toolchain) if not is_supported_version(lean_version): logger.warning( @@ -509,7 +505,7 @@ def from_path(cls, path: Path) -> "LeanGitRepo": @property def name(self) -> str: - return self.repo.name + return os.path.basename(self.url) @property def is_lean4(self) -> bool: @@ -529,12 +525,15 @@ def exists(self) -> bool: def clone_and_checkout(self) -> None: """Clone the repo to the current working directory and checkout a specific commit.""" logger.debug(f"Cloning {self}") - execute(f"git clone -n --recursive {self.url}", capture_output=True) - with working_directory(self.name): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + if self.repo_type == 'github' or self.repo_type == 'remote': + execute(f"git clone -n --recursive {self.url}", capture_output=True) + with working_directory(self.name): + execute( + f"git checkout {self.commit} && git submodule update --recursive", + capture_output=True, + ) + else: + shutil.copytree(self.url, self.name) def get_dependencies( self, path: Union[str, Path, None] = None diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 0386949..5e2434f 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -16,6 +16,7 @@ from contextlib import contextmanager from ray.util.actor_pool import ActorPool from typing import Tuple, Union, List, Generator, Optional +from urllib.parse import urlparse from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR @@ -143,6 +144,33 @@ def camel_case(s: str) -> str: """Convert the string ``s`` to camel case.""" return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") +def repo_type_of_url(url: str) -> str: + """Get the type of the repository. + + Args: + url (str): The URL of the repository. + + Returns: + str: The type of the repository. + """ + parsed_url = urlparse(url) + if parsed_url.scheme in ["http", "https"]: + # case 1 - GitHub URL + if "github.com" in url: + if not url.startswith("https://"): + logger.warning(f"{url} should start with https://") + return + else: + return "github" + # case 2 - remote Git URL + else: + return "remote" + # case 3 - local path + elif os.path.exists(parsed_url.path): + return "local" + else: + logger.warning(f"{url} is not a valid URL") + @cache def get_repo_info(path: Path) -> Tuple[str, str]: diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py index 4c3ec15..cedfdaa 100644 --- a/tests/data_extraction/test_repo_type.py +++ b/tests/data_extraction/test_repo_type.py @@ -6,9 +6,11 @@ # GitHub repository details GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" # Local repository path (make sure this path exists and is the clone of the above GitHub repo) -LOCAL_REPO_PATH = "testdata/lean4-example" + +LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" def clone_repo_if_not_exists(repo_url, local_path, label='main'): if os.path.exists(local_path): @@ -16,7 +18,7 @@ def clone_repo_if_not_exists(repo_url, local_path, label='main'): repo = Repo.clone_from(repo_url, local_path) repo.git.checkout(label) -def test_github_to_local(): +def test_local_with_branch(): # Clone the GitHub repository to the local path clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH) @@ -34,7 +36,7 @@ def test_github_to_local(): assert github_repo.repo_type == 'github' assert local_repo.repo_type == 'local' -def test_github_to_local_with_commit(): +def test_local_with_commit(): # Clone the GitHub repository to the local path clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH, GITHUB_COMMIT_HASH) @@ -51,3 +53,15 @@ def test_github_to_local_with_commit(): # check the repo type assert github_repo.repo_type == 'github' assert local_repo.repo_type == 'local' + +def test_remote_url(): + # Initialize GitHub repo + github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit=GITHUB_COMMIT_HASH) + # Initialize Gitee repo + LeanGitRepo(url=GITEE_REPO_URL, commit="main") # get commit by branch + gitee_repo = LeanGitRepo(url=GITEE_REPO_URL, commit=GITHUB_COMMIT_HASH) + # Check if commit hashes match + assert github_repo.commit == gitee_repo.commit + assert github_repo.lean_version == gitee_repo.lean_version + # check the repo type + assert gitee_repo.repo_type == 'remote' From 7c02c9194f56e5f0451f6218bcafe5d2d7a8ca40 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Thu, 11 Jul 2024 06:18:59 +0800 Subject: [PATCH 05/34] update trace method & tests --- .gitignore | 2 +- src/lean_dojo/data_extraction/lean.py | 2 + src/lean_dojo/data_extraction/traced_data.py | 3 +- src/lean_dojo/utils.py | 7 ++- tests/data_extraction/test_repo_type.py | 6 +-- tests/data_extraction/test_trace.py | 49 +++++++++++++++++++- 6 files changed, 61 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 4c7f50f..c607031 100644 --- a/.gitignore +++ b/.gitignore @@ -130,5 +130,5 @@ dmypy.json # Pyre type checker .pyre/ -testdata/* +testdata .vscode \ No newline at end of file diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 70cab47..fbdaedc 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -520,6 +520,8 @@ def show(self) -> None: webbrowser.open(self.commit_url) def exists(self) -> bool: + if self.repo_type == 'local': + return os.path.exists(self.url) return url_exists(self.commit_url) def clone_and_checkout(self) -> None: diff --git a/src/lean_dojo/data_extraction/traced_data.py b/src/lean_dojo/data_extraction/traced_data.py index 82317b2..a0e8518 100644 --- a/src/lean_dojo/data_extraction/traced_data.py +++ b/src/lean_dojo/data_extraction/traced_data.py @@ -15,6 +15,7 @@ from loguru import logger from dataclasses import dataclass, field from typing import List, Optional, Dict, Any, Tuple, Union +from git import Repo from ..utils import ( is_git_repo, @@ -1030,7 +1031,7 @@ def check_sanity(self) -> None: The function raises exceptions in case of unsuccessful checks. """ logger.debug(f"Checking the sanity of {self}") - assert isinstance(self.repo, LeanGitRepo) + assert isinstance(self.repo, (LeanGitRepo, Repo)) assert isinstance(self.dependencies, dict) for k, v in self.dependencies.items(): assert isinstance(k, str) and isinstance(v, LeanGitRepo) diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 5e2434f..2b5d9ff 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -171,7 +171,6 @@ def repo_type_of_url(url: str) -> str: else: logger.warning(f"{url} is not a valid URL") - @cache def get_repo_info(path: Path) -> Tuple[str, str]: """Get the URL and commit hash of the Git repo at ``path``. @@ -184,7 +183,11 @@ def get_repo_info(path: Path) -> Tuple[str, str]: """ with working_directory(path): # Get the URL. - url_msg, _ = execute(f"git remote get-url origin", capture_output=True) + try: + url_msg, _ = execute(f"git remote get-url origin", capture_output=True) + except Exception as e: # local repo + logger.debug("Local repo with no remote origin.") + url_msg = str(path) url = url_msg.strip() # Get the commit. commit_msg, _ = execute(f"git log -n 1", capture_output=True) diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py index cedfdaa..e86660b 100644 --- a/tests/data_extraction/test_repo_type.py +++ b/tests/data_extraction/test_repo_type.py @@ -1,6 +1,5 @@ from lean_dojo import LeanGitRepo from git import Repo -from github import Github import os, shutil # GitHub repository details @@ -18,9 +17,10 @@ def clone_repo_if_not_exists(repo_url, local_path, label='main'): repo = Repo.clone_from(repo_url, local_path) repo.git.checkout(label) +# Clone the GitHub repository to the local path +clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH) + def test_local_with_branch(): - # Clone the GitHub repository to the local path - clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH) # Initialize GitHub repo github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit="main") diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 0064933..4a5f278 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -1,10 +1,57 @@ from pathlib import Path from lean_dojo import * +from git import Repo +import os, shutil +# repository details +GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" +GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" -def test_trace(traced_repo): +LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" +LOCAL_TRACE_DIR = f"{os.path.dirname(__file__)}/testdata/lean4-example-traced" + + +def clone_repo_and_remove_remote(repo_url, local_path, label='main'): + if os.path.exists(local_path): + shutil.rmtree(local_path) + repo = Repo.clone_from(repo_url, local_path) + repo.git.checkout(label) + remote = repo.remote(name='origin') + remote.remove(repo, 'origin') + +# Clone the GitHub repository to the local path +clone_repo_and_remove_remote(GITHUB_REPO_URL, LOCAL_REPO_PATH) + +def test_remote_trace(): + remote_repo = LeanGitRepo(url=GITEE_REPO_URL, commit="main") + assert remote_repo.repo_type == 'remote' + if os.path.exists(LOCAL_TRACE_DIR): + shutil.rmtree(LOCAL_TRACE_DIR) + traced_repo = trace(remote_repo, LOCAL_TRACE_DIR) traced_repo.check_sanity() + assert traced_repo.repo.repo_type == 'remote' + +def test_local_trace(): + local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") + assert local_repo.repo_type == 'local' + if os.path.exists(LOCAL_TRACE_DIR): + shutil.rmtree(LOCAL_TRACE_DIR) + traced_repo = trace(local_repo, LOCAL_TRACE_DIR) + traced_repo.check_sanity() + assert traced_repo.repo.repo_type == 'local' + +def test_github_trace(): + remote_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit="main") + assert remote_repo.repo_type == 'github' + if os.path.exists(LOCAL_TRACE_DIR): + shutil.rmtree(LOCAL_TRACE_DIR) + traced_repo = trace(remote_repo, LOCAL_TRACE_DIR) + traced_repo.check_sanity() + assert traced_repo.repo.repo_type == 'github' +def test_trace(traced_repo): + traced_repo.check_sanity() def test_get_traced_repo_path(mathlib4_repo): path = get_traced_repo_path(mathlib4_repo) From 268fc2da158301fa476dc56ef1a46f3bd6292e3a Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Thu, 11 Jul 2024 06:56:29 +0800 Subject: [PATCH 06/34] add tests for interaction --- tests/data_extraction/testdata/.keep | 0 tests/interaction/test_nongithub_repo.py | 55 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+) delete mode 100644 tests/data_extraction/testdata/.keep create mode 100644 tests/interaction/test_nongithub_repo.py diff --git a/tests/data_extraction/testdata/.keep b/tests/data_extraction/testdata/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/interaction/test_nongithub_repo.py b/tests/interaction/test_nongithub_repo.py new file mode 100644 index 0000000..e7f0b8e --- /dev/null +++ b/tests/interaction/test_nongithub_repo.py @@ -0,0 +1,55 @@ +from pathlib import Path +from lean_dojo import * +from git import Repo +import os, shutil + +# repository details +GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" +GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" + +LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" +LOCAL_TRACE_DIR = f"{os.path.dirname(__file__)}/testdata/lean4-example-traced" + +def clone_repo_and_remove_remote(repo_url, local_path, label='main'): + if os.path.exists(local_path): + shutil.rmtree(local_path) + repo = Repo.clone_from(repo_url, local_path) + repo.git.checkout(label) + remote = repo.remote(name='origin') + remote.remove(repo, 'origin') + +# Clone the GitHub repository to the local path +clone_repo_and_remove_remote(GITHUB_REPO_URL, LOCAL_REPO_PATH) + +def test_remote_interact(): + repo = LeanGitRepo(url=GITEE_REPO_URL, commit="main") + assert repo.repo_type == 'remote' + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) + +def test_local_interact(): + repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") + assert repo.repo_type == 'local' + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) From 36661d6663219d3daa0bcb456dc293cb635b55ed Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Thu, 11 Jul 2024 07:18:53 +0800 Subject: [PATCH 07/34] use tempdir for local type & resolve cache problem --- src/lean_dojo/data_extraction/cache.py | 7 ++++--- src/lean_dojo/data_extraction/lean.py | 12 ++++++++---- src/lean_dojo/data_extraction/trace.py | 9 +++++++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index acfe1c9..c401e5a 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -8,7 +8,7 @@ from loguru import logger from filelock import FileLock from dataclasses import dataclass, field -from typing import Optional, Tuple, Generator +from typing import Optional, Tuple, Generator, Union from ..utils import ( execute, @@ -90,11 +90,12 @@ def get(self, url: str, commit: str) -> Optional[Path]: else: return None - def store(self, src: Path) -> Path: + def store(self, src: Path, dirpath: Union[Path, None]=None) -> Path: """Store a traced repo at path ``src``. Return its path in the cache.""" url, commit = get_repo_info(src) - dirpath = self.cache_dir / _format_dirname(url, commit) _, repo_name = _split_git_url(url) + if dirpath is None: + dirpath = self.cache_dir / _format_dirname(url, commit) if not dirpath.exists(): with self.lock: with report_critical_failure(_CACHE_CORRPUTION_MSG): diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index fbdaedc..2c70d8b 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -77,18 +77,22 @@ def url_to_repo(url: str, num_retries: int = 2, repo_type: str = 'github') -> Re """ url = normalize_url(url) backoff = 1 + temp_dir = os.path.join(TMP_DIR or '/tmp', str(uuid.uuid4())[:8]) while True: try: if repo_type == 'github': return GITHUB.get_repo("/".join(url.split("/")[-2:])) elif repo_type == 'local': - if os.path.exists(url): - return Repo(url) - else: + if not os.path.exists(url): raise ValueError(f"Local path {url} does not exist") + try: + Repo(url) + except: + raise ValueError(f"Local path {url} is not a git repo") + shutil.copytree(url, f"{temp_dir}/{os.path.basename(url)}") + return Repo(f"{temp_dir}/{os.path.basename(url)}") else: - temp_dir = os.path.join(TMP_DIR or '/tmp', str(uuid.uuid4())[:8]) return Repo.clone_from(url, f"{temp_dir}/{os.path.basename(url)}") except Exception as ex: if num_retries <= 0: diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index e5b2492..53f4093 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -16,7 +16,7 @@ from subprocess import CalledProcessError from typing import Union, Optional, List, Generator -from .cache import cache +from .cache import cache, get_repo_info, _format_dirname from .lean import LeanGitRepo from ..constants import NUM_PROCS from .traced_data import TracedRepo @@ -212,7 +212,12 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: _trace(repo, build_deps) traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() - path = cache.store(tmp_dir / repo.name) + if repo.repo_type == "local": # avoid using the temp prefix + url, commit = get_repo_info(Path(repo.url)) + dirpath = cache.cache_dir / _format_dirname(url, commit) + path = cache.store(tmp_dir / repo.name, dirpath) + else: + path = cache.store(tmp_dir / repo.name) else: logger.debug("The traced repo is available in the cache.") return path From 95714323227f79d7927b4081b1e8fb15ef864d14 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Sun, 21 Jul 2024 01:39:39 +0800 Subject: [PATCH 08/34] update from_path & simplify tests --- src/lean_dojo/data_extraction/lean.py | 3 +- src/lean_dojo/utils.py | 15 +++--- tests/conftest.py | 28 ++++++++++- tests/data_extraction/test_repo_type.py | 64 +++++++++++------------- tests/data_extraction/test_trace.py | 62 ++++++++++------------- tests/interaction/test_nongithub_repo.py | 35 ++++--------- 6 files changed, 100 insertions(+), 107 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 2c70d8b..d0174f8 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -467,7 +467,7 @@ def __post_init__(self) -> None: object.__setattr__(self, "repo_type", repo_type) object.__setattr__(self, "url", normalize_url(self.url, repo_type=repo_type)) object.__setattr__(self, "repo", url_to_repo(self.url, repo_type=repo_type)) - if self.repo_type != 'github': + if self.repo_type != 'github': # checkout to the commit for the `git.Repo` object self.repo.git.checkout(self.commit) # Convert tags or branches to commit hashes @@ -540,6 +540,7 @@ def clone_and_checkout(self) -> None: ) else: shutil.copytree(self.url, self.name) + Repo(self.name).git.checkout(self.commit) def get_dependencies( self, path: Union[str, Path, None] = None diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 2b5d9ff..8b1e5a6 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -183,21 +183,18 @@ def get_repo_info(path: Path) -> Tuple[str, str]: """ with working_directory(path): # Get the URL. - try: - url_msg, _ = execute(f"git remote get-url origin", capture_output=True) - except Exception as e: # local repo - logger.debug("Local repo with no remote origin.") - url_msg = str(path) - url = url_msg.strip() + # url_msg, _ = execute(f"git remote get-url origin", capture_output=True) + # url = url_msg.strip() + url = str(path) # use the absolute path # Get the commit. commit_msg, _ = execute(f"git log -n 1", capture_output=True) m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) assert m is not None commit = m.group() - if url.startswith("git@"): - assert url.endswith(".git") - url = url[: -len(".git")].replace(":", "/").replace("git@", "https://") + # if url.startswith("git@"): + # assert url.endswith(".git") + # url = url[: -len(".git")].replace(":", "/").replace("git@", "https://") return url, commit diff --git a/tests/conftest.py b/tests/conftest.py index 1e14c96..ad80630 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import pytest - +import os, shutil +from git import Repo from lean_dojo import * @@ -14,6 +15,31 @@ LEAN4_EXAMPLE_URL, ] +TEST_GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +REMOTE_EXAMPLE_URL = "https://gitee.com/rexzong/lean4-example" +LOCAL_TEST_PATH = f"{os.path.dirname(__file__)}/testdata" + +@pytest.fixture(scope="session") +def clean_clone_and_checkout(): + def _clean_clone_and_checkout(repo_url, local_path, label='main'): + if os.path.exists(local_path): + shutil.rmtree(local_path) + repo = Repo.clone_from(repo_url, local_path) + repo.git.checkout(label) + return repo + return _clean_clone_and_checkout + +@pytest.fixture(scope="session") +def lean4_example_url(): + return LEAN4_EXAMPLE_URL + +@pytest.fixture(scope="session") +def remote_example_url(): + return REMOTE_EXAMPLE_URL + +@pytest.fixture(scope="session") +def local_test_path(): + return LOCAL_TEST_PATH @pytest.fixture(scope="session") def monkeysession(): diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py index e86660b..60d14c2 100644 --- a/tests/data_extraction/test_repo_type.py +++ b/tests/data_extraction/test_repo_type.py @@ -1,50 +1,37 @@ +import pytest from lean_dojo import LeanGitRepo -from git import Repo -import os, shutil +import os -# GitHub repository details -GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" -GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" -GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" - -# Local repository path (make sure this path exists and is the clone of the above GitHub repo) - -LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" - -def clone_repo_if_not_exists(repo_url, local_path, label='main'): - if os.path.exists(local_path): - shutil.rmtree(local_path) - repo = Repo.clone_from(repo_url, local_path) - repo.git.checkout(label) - -# Clone the GitHub repository to the local path -clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH) - -def test_local_with_branch(): - +def test_local_with_branch(clean_clone_and_checkout, lean4_example_url, local_test_path): # Initialize GitHub repo - github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit="main") + github_repo = LeanGitRepo(url=lean4_example_url, commit="main") # Initialize local repo - local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") + local_repo_path = os.path.join(local_test_path, 'lean4-example') + clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') + local_repo = LeanGitRepo(url=local_repo_path, commit="main") + from_path_repo = LeanGitRepo.from_path(local_repo_path) # Check if commit hashes match - assert github_repo.commit == local_repo.commit - assert github_repo.lean_version == local_repo.lean_version + assert github_repo.commit == local_repo.commit == from_path_repo.commit + assert github_repo.lean_version == local_repo.lean_version == from_path_repo.lean_version # check the repo type assert github_repo.repo_type == 'github' assert local_repo.repo_type == 'local' + assert from_path_repo.repo_type == 'local' + +def test_local_with_commit(clean_clone_and_checkout, lean4_example_url, local_test_path): + # GitHub commit hash from conftest.py + COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" -def test_local_with_commit(): - # Clone the GitHub repository to the local path - clone_repo_if_not_exists(GITHUB_REPO_URL, LOCAL_REPO_PATH, GITHUB_COMMIT_HASH) - # Initialize GitHub repo - github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit=GITHUB_COMMIT_HASH) + github_repo = LeanGitRepo(url=lean4_example_url, commit=COMMIT_HASH) # Initialize local repo - local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit=GITHUB_COMMIT_HASH) + local_repo_path = os.path.join(local_test_path, 'lean4-example') + clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') # use main branch + local_repo = LeanGitRepo(url=local_repo_path, commit=COMMIT_HASH) # checkout to commit hash # Check if commit hashes match assert github_repo.commit == local_repo.commit @@ -54,14 +41,19 @@ def test_local_with_commit(): assert github_repo.repo_type == 'github' assert local_repo.repo_type == 'local' -def test_remote_url(): +def test_remote_url(lean4_example_url, remote_example_url): + # GitHub commit hash from conftest.py + COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" + # Initialize GitHub repo - github_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit=GITHUB_COMMIT_HASH) + github_repo = LeanGitRepo(url=lean4_example_url, commit=COMMIT_HASH) # Initialize Gitee repo - LeanGitRepo(url=GITEE_REPO_URL, commit="main") # get commit by branch - gitee_repo = LeanGitRepo(url=GITEE_REPO_URL, commit=GITHUB_COMMIT_HASH) + _ = LeanGitRepo(url=remote_example_url, commit="main") # get commit by branch + gitee_repo = LeanGitRepo(url=remote_example_url, commit=COMMIT_HASH) + # Check if commit hashes match assert github_repo.commit == gitee_repo.commit assert github_repo.lean_version == gitee_repo.lean_version + # check the repo type assert gitee_repo.repo_type == 'remote' diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 4a5f278..541060c 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -1,54 +1,44 @@ -from pathlib import Path +import pytest from lean_dojo import * from git import Repo import os, shutil +from pathlib import Path -# repository details -GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" -GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" -GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" - -LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" -LOCAL_TRACE_DIR = f"{os.path.dirname(__file__)}/testdata/lean4-example-traced" - - -def clone_repo_and_remove_remote(repo_url, local_path, label='main'): - if os.path.exists(local_path): - shutil.rmtree(local_path) - repo = Repo.clone_from(repo_url, local_path) - repo.git.checkout(label) - remote = repo.remote(name='origin') - remote.remove(repo, 'origin') +# Avoid using remote cache +os.environ['DISABLE_REMOTE_CACHE'] = 'true' -# Clone the GitHub repository to the local path -clone_repo_and_remove_remote(GITHUB_REPO_URL, LOCAL_REPO_PATH) +@pytest.fixture(scope="session") +def local_trace_dir(local_test_path): + return os.path.join(local_test_path, 'lean4-example-traced') -def test_remote_trace(): - remote_repo = LeanGitRepo(url=GITEE_REPO_URL, commit="main") +def test_remote_trace(remote_example_url, local_trace_dir): + remote_repo = LeanGitRepo(url=remote_example_url, commit="main") assert remote_repo.repo_type == 'remote' - if os.path.exists(LOCAL_TRACE_DIR): - shutil.rmtree(LOCAL_TRACE_DIR) - traced_repo = trace(remote_repo, LOCAL_TRACE_DIR) + if os.path.exists(local_trace_dir): + shutil.rmtree(local_trace_dir) + traced_repo = trace(remote_repo, local_trace_dir) traced_repo.check_sanity() - assert traced_repo.repo.repo_type == 'remote' + assert traced_repo.repo.repo_type == 'local' -def test_local_trace(): - local_repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") +def test_local_trace(clean_clone_and_checkout, lean4_example_url, local_test_path, local_trace_dir): + local_repo_path = os.path.join(local_test_path, 'lean4-example') + clean_clone_and_checkout(lean4_example_url, local_repo_path) + local_repo = LeanGitRepo(url=local_repo_path, commit="main") assert local_repo.repo_type == 'local' - if os.path.exists(LOCAL_TRACE_DIR): - shutil.rmtree(LOCAL_TRACE_DIR) - traced_repo = trace(local_repo, LOCAL_TRACE_DIR) + if os.path.exists(local_trace_dir): + shutil.rmtree(local_trace_dir) + traced_repo = trace(local_repo, local_trace_dir) traced_repo.check_sanity() assert traced_repo.repo.repo_type == 'local' -def test_github_trace(): - remote_repo = LeanGitRepo(url=GITHUB_REPO_URL, commit="main") +def test_github_trace(lean4_example_url, local_trace_dir): + remote_repo = LeanGitRepo(url=lean4_example_url, commit="main") assert remote_repo.repo_type == 'github' - if os.path.exists(LOCAL_TRACE_DIR): - shutil.rmtree(LOCAL_TRACE_DIR) - traced_repo = trace(remote_repo, LOCAL_TRACE_DIR) + if os.path.exists(local_trace_dir): + shutil.rmtree(local_trace_dir) + traced_repo = trace(remote_repo, local_trace_dir) traced_repo.check_sanity() - assert traced_repo.repo.repo_type == 'github' + assert traced_repo.repo.repo_type == 'local' def test_trace(traced_repo): traced_repo.check_sanity() diff --git a/tests/interaction/test_nongithub_repo.py b/tests/interaction/test_nongithub_repo.py index e7f0b8e..76078df 100644 --- a/tests/interaction/test_nongithub_repo.py +++ b/tests/interaction/test_nongithub_repo.py @@ -1,29 +1,13 @@ -from pathlib import Path +import pytest from lean_dojo import * from git import Repo -import os, shutil +import os -# repository details -GITHUB_REPO_URL = "https://github.com/yangky11/lean4-example" -GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" -GITEE_REPO_URL = "https://gitee.com/rexzong/lean4-example" +# Avoid using remote cache +os.environ['DISABLE_REMOTE_CACHE'] = 'true' -LOCAL_REPO_PATH = f"{os.path.dirname(__file__)}/testdata/lean4-example" -LOCAL_TRACE_DIR = f"{os.path.dirname(__file__)}/testdata/lean4-example-traced" - -def clone_repo_and_remove_remote(repo_url, local_path, label='main'): - if os.path.exists(local_path): - shutil.rmtree(local_path) - repo = Repo.clone_from(repo_url, local_path) - repo.git.checkout(label) - remote = repo.remote(name='origin') - remote.remove(repo, 'origin') - -# Clone the GitHub repository to the local path -clone_repo_and_remove_remote(GITHUB_REPO_URL, LOCAL_REPO_PATH) - -def test_remote_interact(): - repo = LeanGitRepo(url=GITEE_REPO_URL, commit="main") +def test_remote_interact(remote_example_url): + repo = LeanGitRepo(url=remote_example_url, commit="main") assert repo.repo_type == 'remote' theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state @@ -38,8 +22,11 @@ def test_remote_interact(): final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") assert isinstance(final_state, ProofFinished) -def test_local_interact(): - repo = LeanGitRepo(url=LOCAL_REPO_PATH, commit="main") +def test_local_interact(clean_clone_and_checkout, lean4_example_url, local_test_path): + # Clone the GitHub repository to the local path + local_repo_path = os.path.join(local_test_path, 'lean4-example') + clean_clone_and_checkout(lean4_example_url, local_repo_path) + repo = LeanGitRepo(url=local_repo_path, commit="main") assert repo.repo_type == 'local' theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state From 6429b410c3881b331242ce392092f04a1f6b5ba1 Mon Sep 17 00:00:00 2001 From: rex <1073853456@qq.com> Date: Mon, 22 Jul 2024 20:48:25 +0800 Subject: [PATCH 09/34] fix git version for windows --- src/lean_dojo/constants.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index aa50261..c7a5fc3 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -71,20 +71,25 @@ assert re.fullmatch(r"\d+g", TACTIC_MEMORY_LIMIT) -def check_git_version(min_version: Tuple[int, int, int]) -> Tuple[int, int, int]: +def check_git_version(min_version: Tuple[int, int, int]) -> None: """Check the version of Git installed on the system.""" - res = subprocess.run("git --version", shell=True, capture_output=True, check=True) - output = res.stdout.decode() - error = res.stderr.decode() - assert error == "", error - m = re.match(r"git version (?P[0-9.]+)", output) - version = tuple(int(_) for _ in m["version"].split(".")) - - version_str = ".".join(str(_) for _ in version) - min_version_str = ".".join(str(_) for _ in min_version) - assert ( - version >= min_version - ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." - + try: + res = subprocess.run("git --version", shell=True, capture_output=True, check=True) + output = res.stdout.decode().strip() + error = res.stderr.decode() + assert error == "", error + match = re.search(r"git version (\d+\.\d+\.\d+)", output) + if not match: + raise ValueError("Could not parse Git version from the output.") + # Convert version number string to tuple of integers + version = tuple(int(_) for _ in match.group(1).split('.')) + + version_str = ".".join(str(_) for _ in version) + min_version_str = ".".join(str(_) for _ in min_version) + assert ( + version >= min_version + ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." + except subprocess.CalledProcessError as e: + raise Exception(f"Failed to run git command: {e}") check_git_version((2, 25, 0)) From 0a8796d763d1e4bab0a6eac64e24bda021eefb42 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Tue, 23 Jul 2024 10:50:28 +0800 Subject: [PATCH 10/34] add more tests for repo type --- src/lean_dojo/data_extraction/cache.py | 16 ++--- src/lean_dojo/data_extraction/lean.py | 4 +- src/lean_dojo/data_extraction/trace.py | 10 ++- src/lean_dojo/utils.py | 21 ++---- tests/conftest.py | 6 +- tests/data_extraction/test_repo_type.py | 92 ++++++++++++++++++++++--- 6 files changed, 108 insertions(+), 41 deletions(-) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index c401e5a..5517895 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -90,17 +90,17 @@ def get(self, url: str, commit: str) -> Optional[Path]: else: return None - def store(self, src: Path, dirpath: Union[Path, None]=None) -> Path: + def store(self, src: Path, cache_path: Union[Path, None]=None) -> Path: """Store a traced repo at path ``src``. Return its path in the cache.""" - url, commit = get_repo_info(src) - _, repo_name = _split_git_url(url) - if dirpath is None: - dirpath = self.cache_dir / _format_dirname(url, commit) - if not dirpath.exists(): + if cache_path is None: + url, commit = get_repo_info(src) + _, repo_name = _split_git_url(url) + cache_path = self.cache_dir / _format_dirname(url, commit) / repo_name + if not cache_path.exists(): with self.lock: with report_critical_failure(_CACHE_CORRPUTION_MSG): - shutil.copytree(src, dirpath / repo_name) - return dirpath / repo_name + shutil.copytree(src, cache_path) + return cache_path cache = Cache(CACHE_DIR) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 0045f8a..3f210e5 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -487,8 +487,10 @@ def __post_init__(self) -> None: object.__setattr__(self, "lean_version", lean_version) @classmethod - def from_path(cls, path: Path) -> "LeanGitRepo": + def from_path(cls, path: Union[str, Path]) -> "LeanGitRepo": """Construct a :class:`LeanGitRepo` object from the path to a local Git repo.""" + if isinstance(path, str): + path = Path(path) url, commit = get_repo_info(path) return cls(url, commit) diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 53f4093..963f01d 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -212,12 +212,10 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: _trace(repo, build_deps) traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() - if repo.repo_type == "local": # avoid using the temp prefix - url, commit = get_repo_info(Path(repo.url)) - dirpath = cache.cache_dir / _format_dirname(url, commit) - path = cache.store(tmp_dir / repo.name, dirpath) - else: - path = cache.store(tmp_dir / repo.name) + # cache path: $HOME/.cache/lean_dojo/{_format_dirname}/{repo_name} + url, commit = get_repo_info(Path(repo.url)) + cache_path = cache.cache_dir / _format_dirname(url, commit) / repo.name + path = cache.store(tmp_dir / repo.name, cache_path) else: logger.debug("The traced repo is available in the cache.") return path diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 8b1e5a6..63b19d9 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -181,21 +181,12 @@ def get_repo_info(path: Path) -> Tuple[str, str]: Returns: Tuple[str, str]: URL and (most recent) hash commit """ - with working_directory(path): - # Get the URL. - # url_msg, _ = execute(f"git remote get-url origin", capture_output=True) - # url = url_msg.strip() - url = str(path) # use the absolute path - # Get the commit. - commit_msg, _ = execute(f"git log -n 1", capture_output=True) - m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) - assert m is not None - commit = m.group() - - # if url.startswith("git@"): - # assert url.endswith(".git") - # url = url[: -len(".git")].replace(":", "/").replace("git@", "https://") - + url = str(path.absolute()) # use the absolute path + # Get the commit. + commit_msg, _ = execute(f"git log -n 1", capture_output=True) + m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) + assert m is not None + commit = m.group() return url, commit diff --git a/tests/conftest.py b/tests/conftest.py index ad80630..0df03dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,7 @@ LEAN4_EXAMPLE_URL, ] -TEST_GITHUB_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +EXAMPLE_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" REMOTE_EXAMPLE_URL = "https://gitee.com/rexzong/lean4-example" LOCAL_TEST_PATH = f"{os.path.dirname(__file__)}/testdata" @@ -33,6 +33,10 @@ def _clean_clone_and_checkout(repo_url, local_path, label='main'): def lean4_example_url(): return LEAN4_EXAMPLE_URL +@pytest.fixture(scope="session") +def example_commit_hash(): + return EXAMPLE_COMMIT_HASH + @pytest.fixture(scope="session") def remote_example_url(): return REMOTE_EXAMPLE_URL diff --git a/tests/data_extraction/test_repo_type.py b/tests/data_extraction/test_repo_type.py index 60d14c2..c566856 100644 --- a/tests/data_extraction/test_repo_type.py +++ b/tests/data_extraction/test_repo_type.py @@ -1,6 +1,62 @@ import pytest from lean_dojo import LeanGitRepo import os +from lean_dojo.data_extraction.lean import url_to_repo, _to_commit_hash +from github import Repository +from git import Repo +from lean_dojo.utils import repo_type_of_url, get_repo_info +from pathlib import Path + +def test_repo_types(clean_clone_and_checkout, local_test_path, lean4_example_url, remote_example_url, example_commit_hash): + # init: local repo + local_repo_path = os.path.join(local_test_path, 'lean4-example') + clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') + url, _ = get_repo_info(Path(local_repo_path)) + assert url == local_repo_path + + # test repo type of url + assert repo_type_of_url(lean4_example_url) == 'github' + assert repo_type_of_url(remote_example_url) == 'remote' + assert repo_type_of_url(local_repo_path) == 'local' + + # test url to repo + ## test github url + github_repo = url_to_repo(lean4_example_url, num_retries=2) + assert isinstance(github_repo, Repository.Repository) + assert github_repo.full_name == 'yangky11/lean4-example' + + ## test remote url + remote_repo = url_to_repo(remote_example_url, repo_type='remote', num_retries=2) + assert isinstance(remote_repo, Repo) + assert remote_repo.remotes[0].url == remote_example_url + + ## test local path + local_repo = url_to_repo(local_repo_path, repo_type='local') + assert isinstance(local_repo, Repo) + assert local_repo.remotes[0].url == lean4_example_url + assert ( + local_repo.working_dir != local_repo_path + ), "working_dir should not be the same as local_repo_path to avoid changing the original repo" + + # test commit hash + ## test github url + commit_hash = _to_commit_hash(github_repo, 'main') + assert len(commit_hash) == 40 + commit_hash = _to_commit_hash(github_repo, example_commit_hash) + assert commit_hash == example_commit_hash + + ## test remote url + commit_hash = _to_commit_hash(remote_repo, 'main') + assert len(commit_hash) == 40 + commit_hash = _to_commit_hash(remote_repo, example_commit_hash) + assert commit_hash == example_commit_hash + + ## test local path + commit_hash = _to_commit_hash(local_repo, 'main') + assert len(commit_hash) == 40 + commit_hash = _to_commit_hash(local_repo, example_commit_hash) + assert commit_hash == example_commit_hash + def test_local_with_branch(clean_clone_and_checkout, lean4_example_url, local_test_path): # Initialize GitHub repo @@ -21,17 +77,22 @@ def test_local_with_branch(clean_clone_and_checkout, lean4_example_url, local_te assert local_repo.repo_type == 'local' assert from_path_repo.repo_type == 'local' -def test_local_with_commit(clean_clone_and_checkout, lean4_example_url, local_test_path): - # GitHub commit hash from conftest.py - COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" + # check repo name + assert github_repo.name == 'lean4-example' == local_repo.name == from_path_repo.name + + # test get_config + assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} + assert local_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} + assert from_path_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} +def test_local_with_commit(clean_clone_and_checkout, lean4_example_url, local_test_path, example_commit_hash): # Initialize GitHub repo - github_repo = LeanGitRepo(url=lean4_example_url, commit=COMMIT_HASH) + github_repo = LeanGitRepo(url=lean4_example_url, commit=example_commit_hash) # Initialize local repo local_repo_path = os.path.join(local_test_path, 'lean4-example') clean_clone_and_checkout(lean4_example_url, local_repo_path, 'main') # use main branch - local_repo = LeanGitRepo(url=local_repo_path, commit=COMMIT_HASH) # checkout to commit hash + local_repo = LeanGitRepo(url=local_repo_path, commit=example_commit_hash) # checkout to commit hash # Check if commit hashes match assert github_repo.commit == local_repo.commit @@ -41,15 +102,19 @@ def test_local_with_commit(clean_clone_and_checkout, lean4_example_url, local_te assert github_repo.repo_type == 'github' assert local_repo.repo_type == 'local' -def test_remote_url(lean4_example_url, remote_example_url): - # GitHub commit hash from conftest.py - COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" + # check repo name + assert github_repo.name == 'lean4-example' == local_repo.name + # test get_config + assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} + assert local_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} + +def test_remote_url(lean4_example_url, remote_example_url, example_commit_hash): # Initialize GitHub repo - github_repo = LeanGitRepo(url=lean4_example_url, commit=COMMIT_HASH) + github_repo = LeanGitRepo(url=lean4_example_url, commit=example_commit_hash) # Initialize Gitee repo _ = LeanGitRepo(url=remote_example_url, commit="main") # get commit by branch - gitee_repo = LeanGitRepo(url=remote_example_url, commit=COMMIT_HASH) + gitee_repo = LeanGitRepo(url=remote_example_url, commit=example_commit_hash) # Check if commit hashes match assert github_repo.commit == gitee_repo.commit @@ -57,3 +122,10 @@ def test_remote_url(lean4_example_url, remote_example_url): # check the repo type assert gitee_repo.repo_type == 'remote' + + # check repo name + assert gitee_repo.name == 'lean4-example' == github_repo.name + + # test get_config + assert gitee_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} + assert github_repo.get_config('lean-toolchain') == {'content': 'leanprover/lean4:v4.9.0-rc3\n'} From 12d23cb8e5acfe9854f8e31115bfc13e51cd5310 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Tue, 23 Jul 2024 08:05:14 +0800 Subject: [PATCH 11/34] fix commit url & git clone for windows --- .gitignore | 3 +++ pyproject.toml | 1 + src/lean_dojo/data_extraction/lean.py | 16 +++++++++------- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index cd92677..8d89e22 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json # Pyre type checker .pyre/ + +# vscode debug config +.vscode/ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4625fc2..d6c7061 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "python-dotenv", "loguru", "filelock", + "gitpython", "psutil", "pexpect", "types-psutil", diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 5d7b140..ad4a2a2 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -17,6 +17,8 @@ from github.Repository import Repository from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator +from git import Repo + from ..utils import ( execute, @@ -455,7 +457,7 @@ def is_lean4(self) -> bool: @property def commit_url(self) -> str: - return os.path.join(self.url, f"tree/{self.commit}") + return f"{self.url}/tree/{self.commit}" def show(self) -> None: """Show the repo in the default browser.""" @@ -467,12 +469,12 @@ def exists(self) -> bool: def clone_and_checkout(self) -> None: """Clone the repo to the current working directory and checkout a specific commit.""" logger.debug(f"Cloning {self}") - execute(f"git clone -n --recursive {self.url}", capture_output=True) - with working_directory(self.name): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + # Clone the repository + repo = Repo.clone_from(self.url, Path(self.name)) + # Checkout the specific commit + repo.git.checkout(self.commit) + # Initialize and update submodules + repo.submodule_update(recursive=True) def get_dependencies( self, path: Union[str, Path, None] = None From 97565149d5b261af989d55be0db3d453862feda7 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 24 Jul 2024 03:18:34 +0800 Subject: [PATCH 12/34] add tests for lean repo & dojo --- src/lean_dojo/data_extraction/lean.py | 8 +++----- tests/conftest.py | 8 ++++++++ tests/data_extraction/test_lean_repo.py | 10 ++++++++++ tests/interaction/test_interaction.py | 16 ++++++++++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 tests/data_extraction/test_lean_repo.py create mode 100644 tests/interaction/test_interaction.py diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index ad4a2a2..aeea52f 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -469,12 +469,10 @@ def exists(self) -> bool: def clone_and_checkout(self) -> None: """Clone the repo to the current working directory and checkout a specific commit.""" logger.debug(f"Cloning {self}") - # Clone the repository - repo = Repo.clone_from(self.url, Path(self.name)) - # Checkout the specific commit + repo = Repo.clone_from(self.url, Path(self.name), no_checkout=True) repo.git.checkout(self.commit) - # Initialize and update submodules - repo.submodule_update(recursive=True) + repo.submodule_update(init=True, recursive=True) + def get_dependencies( self, path: Union[str, Path, None] = None diff --git a/tests/conftest.py b/tests/conftest.py index 1e14c96..3767c1d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ AESOP_URL = "https://github.com/leanprover-community/aesop" MATHLIB4_URL = "https://github.com/leanprover-community/mathlib4" LEAN4_EXAMPLE_URL = "https://github.com/yangky11/lean4-example" +EXAMPLE_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" URLS = [ BATTERIES_URL, AESOP_URL, @@ -14,6 +15,13 @@ LEAN4_EXAMPLE_URL, ] +@pytest.fixture(scope="session") +def example_commit_hash(): + return EXAMPLE_COMMIT_HASH + +@pytest.fixture(scope="session") +def lean4_example_url(): + return LEAN4_EXAMPLE_URL @pytest.fixture(scope="session") def monkeysession(): diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py new file mode 100644 index 0000000..4e1e331 --- /dev/null +++ b/tests/data_extraction/test_lean_repo.py @@ -0,0 +1,10 @@ +# test for the class `LeanGitRepo` +from lean_dojo import LeanGitRepo + +def test_lean_git_repo(lean4_example_url, example_commit_hash): + repo = LeanGitRepo(lean4_example_url, example_commit_hash) + assert repo.url == lean4_example_url + assert repo.commit == example_commit_hash + assert repo.exists() + assert repo.name == "lean4-example" + assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" \ No newline at end of file diff --git a/tests/interaction/test_interaction.py b/tests/interaction/test_interaction.py new file mode 100644 index 0000000..6288bd9 --- /dev/null +++ b/tests/interaction/test_interaction.py @@ -0,0 +1,16 @@ +from lean_dojo import LeanGitRepo, Dojo, ProofFinished, ProofGivenUp, Theorem + +def test_remote_interact(lean4_example_url): + repo = LeanGitRepo(url=lean4_example_url, commit="main") + theorem = Theorem(repo, "Lean4Example.lean", "hello_world") + # initial state + dojo, state_0 = Dojo(theorem).__enter__() + assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' + # state after running a tactic + state_1 = dojo.run_tac(state_0, "rw [add_assoc]") + assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' + # state after running another a sorry tactic + assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() + # finish proof + final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") + assert isinstance(final_state, ProofFinished) \ No newline at end of file From a8a0977be231113debaab149df78fbfa407d70cc Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 24 Jul 2024 03:47:16 +0800 Subject: [PATCH 13/34] simplify commit hash & delay initalization of lean4 repo --- src/lean_dojo/data_extraction/lean.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 5d7b140..8e5a75c 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -43,7 +43,7 @@ ) GITHUB = Github() -LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") +LEAN4_REPO = None """The GitHub Repo for Lean 4 itself.""" _URL_REGEX = re.compile(r"(?P.*?)/*") @@ -88,15 +88,9 @@ def _to_commit_hash(repo: Repository, label: str) -> str: logger.debug(f"Querying the commit hash for {repo.name} {label}") try: - return repo.get_branch(label).commit.sha - except GithubException: - pass - - for tag in repo.get_tags(): - if tag.name == label: - return tag.commit.sha - - raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + return repo.get_commit(label).sha + except Exception: + raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") @dataclass(eq=True, unsafe_hash=True) @@ -328,6 +322,9 @@ def get_lean4_version_from_config(toolchain: str) -> str: def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: """Return the required Lean commit given a ``lean-toolchain`` config.""" assert "content" in config_dict, "config_dict must have a 'content' field" + global LEAN4_REPO + if LEAN4_REPO is None: + LEAN4_REPO = GITHUB.get_repo("leanprover/lean4") config = config_dict["content"].strip() prefix = "leanprover/lean4:" assert config.startswith(prefix), f"Invalid Lean 4 version: {config}" @@ -447,7 +444,7 @@ def from_path(cls, path: Path) -> "LeanGitRepo": @property def name(self) -> str: - return self.repo.name + return os.path.basename(self.url) @property def is_lean4(self) -> bool: From fab19e2b72458723d2ede2020c108cc0f8750d27 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Wed, 24 Jul 2024 11:47:28 +0800 Subject: [PATCH 14/34] lean version: use string instead of commit --- src/lean_dojo/data_extraction/lean.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 8e5a75c..427b531 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -427,9 +427,14 @@ def __post_init__(self) -> None: lean_version = self.commit else: config = self.get_config("lean-toolchain") - lean_version = get_lean4_commit_from_config(config) - v = get_lean4_version_from_config(config["content"]) - if not is_supported_version(v): + toolchain = config["content"] + m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) + if m is not None: + lean_version = m["version"] + else: + # lean_version_commit = get_lean4_commit_from_config(config) + lean_version = get_lean4_version_from_config(toolchain) + if not is_supported_version(lean_version): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" ) From 60a500ee23ddae962590bab989ae14d84a57c933 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Wed, 24 Jul 2024 19:55:06 +0000 Subject: [PATCH 15/34] format code --- src/lean_dojo/constants.py | 9 ++++++--- src/lean_dojo/data_extraction/lean.py | 1 - tests/conftest.py | 3 +++ tests/data_extraction/test_lean_repo.py | 3 ++- tests/interaction/test_interaction.py | 7 ++++--- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index c7a5fc3..fd3f74b 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -74,7 +74,9 @@ def check_git_version(min_version: Tuple[int, int, int]) -> None: """Check the version of Git installed on the system.""" try: - res = subprocess.run("git --version", shell=True, capture_output=True, check=True) + res = subprocess.run( + "git --version", shell=True, capture_output=True, check=True + ) output = res.stdout.decode().strip() error = res.stderr.decode() assert error == "", error @@ -82,8 +84,8 @@ def check_git_version(min_version: Tuple[int, int, int]) -> None: if not match: raise ValueError("Could not parse Git version from the output.") # Convert version number string to tuple of integers - version = tuple(int(_) for _ in match.group(1).split('.')) - + version = tuple(int(_) for _ in match.group(1).split(".")) + version_str = ".".join(str(_) for _ in version) min_version_str = ".".join(str(_) for _ in min_version) assert ( @@ -92,4 +94,5 @@ def check_git_version(min_version: Tuple[int, int, int]) -> None: except subprocess.CalledProcessError as e: raise Exception(f"Failed to run git command: {e}") + check_git_version((2, 25, 0)) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index aeea52f..c12fae7 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -473,7 +473,6 @@ def clone_and_checkout(self) -> None: repo.git.checkout(self.commit) repo.submodule_update(init=True, recursive=True) - def get_dependencies( self, path: Union[str, Path, None] = None ) -> Dict[str, "LeanGitRepo"]: diff --git a/tests/conftest.py b/tests/conftest.py index 3767c1d..7113916 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,14 +15,17 @@ LEAN4_EXAMPLE_URL, ] + @pytest.fixture(scope="session") def example_commit_hash(): return EXAMPLE_COMMIT_HASH + @pytest.fixture(scope="session") def lean4_example_url(): return LEAN4_EXAMPLE_URL + @pytest.fixture(scope="session") def monkeysession(): with pytest.MonkeyPatch.context() as mp: diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 4e1e331..005218c 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -1,10 +1,11 @@ # test for the class `LeanGitRepo` from lean_dojo import LeanGitRepo + def test_lean_git_repo(lean4_example_url, example_commit_hash): repo = LeanGitRepo(lean4_example_url, example_commit_hash) assert repo.url == lean4_example_url assert repo.commit == example_commit_hash assert repo.exists() assert repo.name == "lean4-example" - assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" \ No newline at end of file + assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" diff --git a/tests/interaction/test_interaction.py b/tests/interaction/test_interaction.py index 6288bd9..1e0b858 100644 --- a/tests/interaction/test_interaction.py +++ b/tests/interaction/test_interaction.py @@ -1,16 +1,17 @@ from lean_dojo import LeanGitRepo, Dojo, ProofFinished, ProofGivenUp, Theorem + def test_remote_interact(lean4_example_url): repo = LeanGitRepo(url=lean4_example_url, commit="main") theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state dojo, state_0 = Dojo(theorem).__enter__() - assert state_0.pp == 'a b c : Nat\n⊢ a + b + c = a + c + b' + assert state_0.pp == "a b c : Nat\n⊢ a + b + c = a + c + b" # state after running a tactic state_1 = dojo.run_tac(state_0, "rw [add_assoc]") - assert state_1.pp == 'a b c : Nat\n⊢ a + (b + c) = a + c + b' + assert state_1.pp == "a b c : Nat\n⊢ a + (b + c) = a + c + b" # state after running another a sorry tactic assert dojo.run_tac(state_1, "sorry") == ProofGivenUp() # finish proof final_state = dojo.run_tac(state_1, "rw [add_comm b, ←add_assoc]") - assert isinstance(final_state, ProofFinished) \ No newline at end of file + assert isinstance(final_state, ProofFinished) From eabaa222dca8db700a66d3b8b5eed822ad470cf3 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Thu, 25 Jul 2024 14:35:16 +0800 Subject: [PATCH 16/34] update cache method --- src/lean_dojo/data_extraction/cache.py | 6 +++-- src/lean_dojo/data_extraction/lean.py | 29 +++++++++++++++---------- src/lean_dojo/data_extraction/trace.py | 2 +- tests/data_extraction/test_lean_repo.py | 18 +++++++++++++++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index acfe1c9..e878ff2 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -90,10 +90,12 @@ def get(self, url: str, commit: str) -> Optional[Path]: else: return None - def store(self, src: Path) -> Path: + def store(self, src: Path, fmt_name: str = "") -> Path: """Store a traced repo at path ``src``. Return its path in the cache.""" url, commit = get_repo_info(src) - dirpath = self.cache_dir / _format_dirname(url, commit) + if fmt_name == "": # if not specified, extract from the traced repo + fmt_name = _format_dirname(url, commit) + dirpath = self.cache_dir / fmt_name _, repo_name = _split_git_url(url) if not dirpath.exists(): with self.lock: diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 0daed9e..cc37e91 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -28,6 +28,7 @@ working_directory, ) from ..constants import LEAN4_URL +from .cache import _format_dirname GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) @@ -90,9 +91,14 @@ def _to_commit_hash(repo: Repository, label: str) -> str: logger.debug(f"Querying the commit hash for {repo.name} {label}") try: - return repo.get_commit(label).sha - except Exception: - raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + return repo.get_branch(label).commit.sha + except GithubException: + pass + + for tag in repo.get_tags(): + if tag.name == label: + return tag.commit.sha + raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") @dataclass(eq=True, unsafe_hash=True) @@ -429,13 +435,7 @@ def __post_init__(self) -> None: lean_version = self.commit else: config = self.get_config("lean-toolchain") - toolchain = config["content"] - m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) - if m is not None: - lean_version = m["version"] - else: - # lean_version_commit = get_lean4_commit_from_config(config) - lean_version = get_lean4_version_from_config(toolchain) + lean_version = get_lean4_version_from_config(config["content"]) if not is_supported_version(lean_version): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" @@ -444,9 +444,9 @@ def __post_init__(self) -> None: object.__setattr__(self, "lean_version", lean_version) @classmethod - def from_path(cls, path: Path) -> "LeanGitRepo": + def from_path(cls, path: Union[Path, str]) -> "LeanGitRepo": """Construct a :class:`LeanGitRepo` object from the path to a local Git repo.""" - url, commit = get_repo_info(path) + url, commit = get_repo_info(Path(path)) return cls(url, commit) @property @@ -461,6 +461,11 @@ def is_lean4(self) -> bool: def commit_url(self) -> str: return f"{self.url}/tree/{self.commit}" + @property + def format_dirname(self) -> str: + """Return the formatted cache directory name""" + return _format_dirname(self.url, self.commit) + def show(self) -> None: """Show the repo in the default browser.""" webbrowser.open(self.commit_url) diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index e5b2492..f30a511 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -212,7 +212,7 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: _trace(repo, build_deps) traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() - path = cache.store(tmp_dir / repo.name) + path = cache.store(tmp_dir / repo.name, repo.format_dirname) else: logger.debug("The traced repo is available in the cache.") return path diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 005218c..9660c3c 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -1,5 +1,7 @@ # test for the class `LeanGitRepo` from lean_dojo import LeanGitRepo +from lean_dojo.data_extraction.lean import _to_commit_hash +from lean_dojo.constants import LEAN4_URL def test_lean_git_repo(lean4_example_url, example_commit_hash): @@ -9,3 +11,19 @@ def test_lean_git_repo(lean4_example_url, example_commit_hash): assert repo.exists() assert repo.name == "lean4-example" assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" + # test cache directory + assert ( + repo.format_dirname + == "yangky11-lean4-example-3f8c5eb303a225cdef609498b8d87262e5ef344b" + ) + # test commit hash + ## test commit hash + assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash + ## test branch, assume the branch is not changed + assert _to_commit_hash(repo, "paper") == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ## test tag + lean4_repo = LeanGitRepo(LEAN4_URL, "master") + assert ( + _to_commit_hash(lean4_repo, "v4.9.1") + == "1b78cb4836cf626007bd38872956a6fab8910993" + ) From ff0f9f475e09e70019cde469de26e52ee1fff686 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Fri, 26 Jul 2024 03:30:25 +0800 Subject: [PATCH 17/34] update functions for git Repos & add tests --- src/lean_dojo/data_extraction/cache.py | 24 +++--- src/lean_dojo/data_extraction/lean.py | 89 ++++++++++++++----- src/lean_dojo/data_extraction/trace.py | 4 +- src/lean_dojo/utils.py | 28 ++++++ tests/conftest.py | 6 ++ tests/data_extraction/test_cache.py | 55 ++++++++++++ tests/data_extraction/test_lean_repo.py | 109 ++++++++++++++++++++---- 7 files changed, 266 insertions(+), 49 deletions(-) create mode 100644 tests/data_extraction/test_cache.py diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index e878ff2..829b7e8 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -59,11 +59,11 @@ def __post_init__(self): lock_path = self.cache_dir.with_suffix(".lock") object.__setattr__(self, "lock", FileLock(lock_path)) - def get(self, url: str, commit: str) -> Optional[Path]: + def get(self, url: str, commit: str, prefix: str = "") -> Optional[Path]: """Get the path of a traced repo with URL ``url`` and commit hash ``commit``. Return None if no such repo can be found.""" _, repo_name = _split_git_url(url) dirname = _format_dirname(url, commit) - dirpath = self.cache_dir / dirname + dirpath = self.cache_dir / prefix / dirname with self.lock: if dirpath.exists(): @@ -90,18 +90,20 @@ def get(self, url: str, commit: str) -> Optional[Path]: else: return None - def store(self, src: Path, fmt_name: str = "") -> Path: - """Store a traced repo at path ``src``. Return its path in the cache.""" - url, commit = get_repo_info(src) - if fmt_name == "": # if not specified, extract from the traced repo - fmt_name = _format_dirname(url, commit) - dirpath = self.cache_dir / fmt_name - _, repo_name = _split_git_url(url) + def store(self, src: Path, rel_cache_dir: Path) -> Path: + """Store a traced repo at path ``src``. Return its path in the cache. + + Args: + src (Path): Path to the repo. + rel_cache_name (Path): The relative path of the stored repo in the cache. + """ + dirpath = self.cache_dir / rel_cache_dir.parent + cache_path = self.cache_dir / rel_cache_dir if not dirpath.exists(): with self.lock: with report_critical_failure(_CACHE_CORRPUTION_MSG): - shutil.copytree(src, dirpath / repo_name) - return dirpath / repo_name + shutil.copytree(src, cache_path) + return cache_path cache = Cache(CACHE_DIR) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index cc37e91..b9c5f60 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -18,6 +18,9 @@ from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator from git import Repo +from ..constants import TMP_DIR +import uuid +import shutil from ..utils import ( @@ -26,6 +29,8 @@ url_exists, get_repo_info, working_directory, + is_git_repo, + repo_type_of_url, ) from ..constants import LEAN4_URL from .cache import _format_dirname @@ -52,18 +57,46 @@ _URL_REGEX = re.compile(r"(?P.*?)/*") -def normalize_url(url: str) -> str: +def normalize_url(url: str, repo_type: str = "github") -> str: + if repo_type == "local": + return os.path.abspath(url) # Convert to absolute path if local return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. @cache -def url_to_repo(url: str, num_retries: int = 2) -> Repository: +def url_to_repo( + url: str, + num_retries: int = 2, + repo_type: Union[str, None] = None, + tmp_dir: Union[Path] = None, +) -> Union[Repo, Repository]: + """Convert a URL to a Repo object. + + Args: + url (str): The URL of the repository. + num_retries (int): Number of retries in case of failure. + repo_type (Optional[str]): The type of the repository. Defaults to None. + tmp_dir (Optional[Path]): The temporary directory to clone the repo to. Defaults to None. + + Returns: + Repo: A Git Repo object. + """ url = normalize_url(url) backoff = 1 - + tmp_dir = tmp_dir or os.path.join(TMP_DIR or "/tmp", str(uuid.uuid4())[:8]) + repo_type = repo_type or repo_type_of_url(url) while True: try: - return GITHUB.get_repo("/".join(url.split("/")[-2:])) + if repo_type == "github": + return GITHUB.get_repo("/".join(url.split("/")[-2:])) + with working_directory(tmp_dir): + repo_name = os.path.basename(url) + if repo_type == "local": + assert is_git_repo(url), f"Local path {url} is not a git repo" + shutil.copytree(url, repo_name) + return Repo(repo_name) + else: + return Repo.clone_from(url, repo_name) except Exception as ex: if num_retries <= 0: raise ex @@ -77,7 +110,10 @@ def url_to_repo(url: str, num_retries: int = 2) -> Repository: def get_latest_commit(url: str) -> str: """Get the hash of the latest commit of the Git repo at ``url``.""" repo = url_to_repo(url) - return repo.get_branch(repo.default_branch).commit.sha + if isinstance(repo, Repository): + return repo.get_branch(repo.default_branch).commit.sha + else: + return repo.head.commit.hexsha def cleanse_string(s: Union[str, Path]) -> str: @@ -85,20 +121,28 @@ def cleanse_string(s: Union[str, Path]) -> str: return str(s).replace("/", "_").replace(":", "_") -@cache -def _to_commit_hash(repo: Repository, label: str) -> str: +def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: """Convert a tag or branch to a commit hash.""" - logger.debug(f"Querying the commit hash for {repo.name} {label}") - - try: - return repo.get_branch(label).commit.sha - except GithubException: - pass - - for tag in repo.get_tags(): - if tag.name == label: - return tag.commit.sha - raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + # GitHub repository + if isinstance(repo, Repository): + logger.debug(f"Querying the commit hash for {repo.name} {label}") + try: + commit = repo.get_commit(label).sha + except GithubException as e: + raise ValueError(f"Invalid tag or branch: `{label}` for {repo.name}") + # Local or remote Git repository + elif isinstance(repo, Repo): + logger.debug( + f"Querying the commit hash for {repo.working_dir} repository {label}" + ) + try: + # Resolve the label to a commit hash + commit = repo.commit(label).hexsha + except Exception as e: + raise ValueError(f"Error converting ref to commit hash: {e}") + else: + raise TypeError("Unsupported repository type") + return commit @dataclass(eq=True, unsafe_hash=True) @@ -320,6 +364,11 @@ def __getitem__(self, key) -> str: _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +def is_commit_hash(s: str): + """Check if a string is a valid commit hash.""" + return len(s) == 40 and _COMMIT_REGEX.fullmatch(s) + + def get_lean4_version_from_config(toolchain: str) -> str: """Return the required Lean version given a ``lean-toolchain`` config.""" m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) @@ -419,12 +468,12 @@ def __post_init__(self) -> None: object.__setattr__(self, "repo", url_to_repo(self.url)) # Convert tags or branches to commit hashes - if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): + if not is_commit_hash(self.commit): if (self.url, self.commit) in info_cache.tag2commit: commit = info_cache.tag2commit[(self.url, self.commit)] else: commit = _to_commit_hash(self.repo, self.commit) - assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" + assert is_commit_hash(commit) info_cache.tag2commit[(self.url, self.commit)] = commit object.__setattr__(self, "commit", commit) diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index f30a511..56f2358 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -212,7 +212,9 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: _trace(repo, build_deps) traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() - path = cache.store(tmp_dir / repo.name, repo.format_dirname) + src_dir = tmp_dir / repo.name + rel_cache_dir = Path(repo.format_dirname / repo.name) + path = cache.store(src_dir, rel_cache_dir) else: logger.debug("The traced repo is available in the cache.") return path diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 0386949..12cf101 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -16,6 +16,7 @@ from contextlib import contextmanager from ray.util.actor_pool import ActorPool from typing import Tuple, Union, List, Generator, Optional +from urllib.parse import urlparse from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR @@ -144,6 +145,33 @@ def camel_case(s: str) -> str: return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") +def repo_type_of_url(url: str) -> Union[str, None]: + """Get the type of the repository. + + Args: + url (str): The URL of the repository. + Returns: + str: The type of the repository. + """ + parsed_url = urlparse(url) + if parsed_url.scheme in ["http", "https"]: + # case 1 - GitHub URL + if "github.com" in url: + if not url.startswith("https://"): + logger.warning(f"{url} should start with https://") + return + else: + return "github" + # case 2 - remote URL + elif url_exists(url): # not check whether it is a git URL + return "remote" + # case 3 - local path + elif is_git_repo(Path(parsed_url.path)): + return "local" + logger.warning(f"{url} is not a valid URL") + return None + + @cache def get_repo_info(path: Path) -> Tuple[str, str]: """Get the URL and commit hash of the Git repo at ``path``. diff --git a/tests/conftest.py b/tests/conftest.py index 7113916..d581c62 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,7 @@ MATHLIB4_URL = "https://github.com/leanprover-community/mathlib4" LEAN4_EXAMPLE_URL = "https://github.com/yangky11/lean4-example" EXAMPLE_COMMIT_HASH = "3f8c5eb303a225cdef609498b8d87262e5ef344b" +REMOTE_EXAMPLE_URL = "https://gitee.com/rexzong/lean4-example" URLS = [ BATTERIES_URL, AESOP_URL, @@ -16,6 +17,11 @@ ] +@pytest.fixture(scope="session") +def remote_example_url(): + return REMOTE_EXAMPLE_URL + + @pytest.fixture(scope="session") def example_commit_hash(): return EXAMPLE_COMMIT_HASH diff --git a/tests/data_extraction/test_cache.py b/tests/data_extraction/test_cache.py new file mode 100644 index 0000000..5f8004a --- /dev/null +++ b/tests/data_extraction/test_cache.py @@ -0,0 +1,55 @@ +# test for cache manager +from git import Repo +from lean_dojo.utils import working_directory +from pathlib import Path +from lean_dojo.data_extraction.lean import _format_dirname +from lean_dojo.data_extraction.cache import cache + + +def test_get_cache(lean4_example_url, remote_example_url, example_commit_hash): + # Note: The `git.Repo` requires the local repo to be cloned in a directory + # all cached repos are stored in CACHE_DIR/repos + prefix = "repos" + + # test local repo cache + with working_directory() as tmp_dir: + # assume that the local repo placed in `/.../testrepo/lean4-example` + repo = Repo.clone_from(lean4_example_url, "testrepo/lean4-example") + repo.git.checkout(example_commit_hash) + local_dir = tmp_dir / "testrepo/lean4-example" + # use local_dir as the key to store the cache + rel_cache_dir = ( + prefix + / Path(_format_dirname(str(local_dir), example_commit_hash)) + / local_dir.name + ) + cache.store(local_dir, rel_cache_dir) + # get the cache + local_url, local_commit = str(local_dir), example_commit_hash + repo_cache = cache.get(local_url, local_commit, prefix) + assert ( + _format_dirname(local_url, local_commit) + == f"{local_dir.parent.name}-{local_dir.name}-{local_commit}" + ) + assert repo_cache is not None + + # test remote repo cache + with working_directory() as tmp_dir: + repo = Repo.clone_from(remote_example_url, "lean4-example") + repo.git.checkout(example_commit_hash) + tmp_remote_dir = tmp_dir / "lean4-example" + # use remote url as the key to store the cache + rel_cache_dir = ( + prefix + / Path(_format_dirname(str(remote_example_url), example_commit_hash)) + / tmp_remote_dir.name + ) + cache.store(tmp_remote_dir, rel_cache_dir) + # get the cache + remote_url, remote_commit = remote_example_url, example_commit_hash + repo_cache = cache.get(remote_url, remote_commit, prefix) + assert repo_cache is not None + assert ( + _format_dirname(remote_url, remote_commit) + == f"rexzong-lean4-example-{example_commit_hash}" + ) diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 9660c3c..c25b18c 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -1,10 +1,99 @@ # test for the class `LeanGitRepo` from lean_dojo import LeanGitRepo -from lean_dojo.data_extraction.lean import _to_commit_hash from lean_dojo.constants import LEAN4_URL +from git import Repo +from github.Repository import Repository +from lean_dojo.utils import working_directory, repo_type_of_url +from lean_dojo.data_extraction.lean import ( + _to_commit_hash, + url_to_repo, + get_latest_commit, + is_commit_hash, + GITHUB, +) -def test_lean_git_repo(lean4_example_url, example_commit_hash): +def test_url_to_repo(lean4_example_url, remote_example_url): + repo_name = "lean4-example" + + # 1. github + ## test get_latest_commit + gh_cm_hash = get_latest_commit(lean4_example_url) + assert is_commit_hash(gh_cm_hash) + + ## test url_to_repo & repo_type_of_url + github_repo = url_to_repo(lean4_example_url) + assert repo_type_of_url(lean4_example_url) == "github" + assert isinstance(github_repo, Repository) + assert github_repo.name == repo_name + + # 2. local + with working_directory() as tmp_dir: + + ## clone from github + Repo.clone_from(lean4_example_url, repo_name) + + ## test get_latest_commit + local_url = str((tmp_dir / repo_name).absolute()) + assert get_latest_commit(local_url) == gh_cm_hash + + ## test url_to_repo & repo_type_of_url + local_repo = url_to_repo(local_url, repo_type="local") + assert repo_type_of_url(local_url) == "local" + assert isinstance(local_repo, Repo) + assert ( + local_repo.working_dir != local_url + ), "The working directory should not be the same as the original repo" + + # 3. remote + with working_directory(): + remote_repo = url_to_repo(remote_example_url) + assert repo_type_of_url(remote_example_url) == "remote" + assert isinstance(remote_repo, Repo) + re_cm_hash = get_latest_commit(remote_example_url) + tmp_repo_path = str(remote_repo.working_dir) + assert re_cm_hash == get_latest_commit(tmp_repo_path) + + +def test_to_commit_hash(lean4_example_url, remote_example_url, example_commit_hash): + # 1. github + repo = GITHUB.get_repo("yangky11/lean4-example") + ## commit hash + assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash + ## branch, assume this branch is not changing + assert _to_commit_hash(repo, "paper") == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + gh_main_hash = _to_commit_hash(repo, "main") + ## git tag + assert ( + _to_commit_hash(GITHUB.get_repo("leanprover/lean4"), "v4.9.1") + == "1b78cb4836cf626007bd38872956a6fab8910993" + ) + # 2. local + with working_directory(): + repo = Repo.clone_from(lean4_example_url, "lean4-example") + repo.git.checkout(example_commit_hash) + repo.create_tag("v0.1.0") # create a tag + repo.git.checkout("main") + assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash + assert _to_commit_hash(repo, "main") == gh_main_hash + assert ( + _to_commit_hash(repo, "origin/paper") + == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ) + assert _to_commit_hash(repo, "v0.1.0") == example_commit_hash + + # 3. remote + with working_directory(): + repo = url_to_repo(remote_example_url) + assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash + assert ( + _to_commit_hash(repo, "origin/paper") + == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" + ) + # no tags in the remote repo + + +def test_git_lean_repo(lean4_example_url, example_commit_hash): repo = LeanGitRepo(lean4_example_url, example_commit_hash) assert repo.url == lean4_example_url assert repo.commit == example_commit_hash @@ -12,18 +101,4 @@ def test_lean_git_repo(lean4_example_url, example_commit_hash): assert repo.name == "lean4-example" assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" # test cache directory - assert ( - repo.format_dirname - == "yangky11-lean4-example-3f8c5eb303a225cdef609498b8d87262e5ef344b" - ) - # test commit hash - ## test commit hash - assert _to_commit_hash(repo, example_commit_hash) == example_commit_hash - ## test branch, assume the branch is not changed - assert _to_commit_hash(repo, "paper") == "8bf74cf67d1acf652a0c74baaa9dc3b9b9e4098c" - ## test tag - lean4_repo = LeanGitRepo(LEAN4_URL, "master") - assert ( - _to_commit_hash(lean4_repo, "v4.9.1") - == "1b78cb4836cf626007bd38872956a6fab8910993" - ) + assert repo.format_dirname == f"yangky11-lean4-example-{example_commit_hash}" From 6c7ef63875ab82a2c2f1112baaf5f45b1e347768 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Fri, 26 Jul 2024 04:04:05 +0800 Subject: [PATCH 18/34] fix url_exists --- src/lean_dojo/data_extraction/trace.py | 2 +- src/lean_dojo/utils.py | 8 ++++++-- tests/data_extraction/test_trace.py | 8 ++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 56f2358..9527c5a 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -213,7 +213,7 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) traced_repo.save_to_disk() src_dir = tmp_dir / repo.name - rel_cache_dir = Path(repo.format_dirname / repo.name) + rel_cache_dir = Path(repo.format_dirname) / repo.name path = cache.store(src_dir, rel_cache_dir) else: logger.debug("The traced repo is available in the cache.") diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 12cf101..2ee132e 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -237,9 +237,13 @@ def read_url(url: str, num_retries: int = 2) -> str: @cache def url_exists(url: str) -> bool: - """Return True if the URL ``url`` exists.""" + """Return True if the URL ``url`` exists, using the GITHUB_ACCESS_TOKEN for authentication if provided.""" try: - with urllib.request.urlopen(url) as _: + request = urllib.request.Request(url) + gh_token = os.getenv("GITHUB_ACCESS_TOKEN") + if gh_token is not None: + request.add_header("Authorization", f"token {gh_token}") + with urllib.request.urlopen(request) as _: return True except urllib.error.HTTPError: return False diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 0064933..71788fd 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -1,5 +1,13 @@ from pathlib import Path from lean_dojo import * +from lean_dojo.data_extraction.cache import cache + + +def test_example_trace(lean4_example_repo): + trace_repo = trace(lean4_example_repo) + repo = trace_repo.repo + path = cache.get(repo.url, repo.commit) + assert path is not None def test_trace(traced_repo): From e5941ff350445edb89ab17b70ad2e2a8be8195a0 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Fri, 26 Jul 2024 10:47:05 +0800 Subject: [PATCH 19/34] allow repo_type of git@github.com --- src/lean_dojo/data_extraction/lean.py | 34 +++++++++++++++++++++++-- src/lean_dojo/utils.py | 28 -------------------- tests/data_extraction/test_lean_repo.py | 5 +++- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index b9c5f60..ed82cdf 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -21,16 +21,15 @@ from ..constants import TMP_DIR import uuid import shutil +from urllib.parse import urlparse from ..utils import ( - execute, read_url, url_exists, get_repo_info, working_directory, is_git_repo, - repo_type_of_url, ) from ..constants import LEAN4_URL from .cache import _format_dirname @@ -56,6 +55,8 @@ _URL_REGEX = re.compile(r"(?P.*?)/*") +_SSH_TO_HTTPS_REGEX = re.compile(r"^git@github\.com:(.+)/(.+)(?:\.git)?$") + def normalize_url(url: str, repo_type: str = "github") -> str: if repo_type == "local": @@ -63,6 +64,35 @@ def normalize_url(url: str, repo_type: str = "github") -> str: return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. +def repo_type_of_url(url: str) -> Union[str, None]: + """Get the type of the repository. + + Args: + url (str): The URL of the repository. + Returns: + str: The type of the repository. + """ + m = _SSH_TO_HTTPS_REGEX.match(url) + url = f"https://github.com/{m.group(1)}/{m.group(2)}" if m else url + parsed_url = urlparse(url) + if parsed_url.scheme in ["http", "https"]: + # case 1 - GitHub URL + if "github.com" in url: + if not url.startswith("https://"): + logger.warning(f"{url} should start with https://") + return + else: + return "github" + # case 2 - remote URL + elif url_exists(url): # not check whether it is a git URL + return "remote" + # case 3 - local path + elif is_git_repo(Path(parsed_url.path)): + return "local" + logger.warning(f"{url} is not a valid URL") + return None + + @cache def url_to_repo( url: str, diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 2ee132e..0adf60f 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -16,7 +16,6 @@ from contextlib import contextmanager from ray.util.actor_pool import ActorPool from typing import Tuple, Union, List, Generator, Optional -from urllib.parse import urlparse from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR @@ -145,33 +144,6 @@ def camel_case(s: str) -> str: return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") -def repo_type_of_url(url: str) -> Union[str, None]: - """Get the type of the repository. - - Args: - url (str): The URL of the repository. - Returns: - str: The type of the repository. - """ - parsed_url = urlparse(url) - if parsed_url.scheme in ["http", "https"]: - # case 1 - GitHub URL - if "github.com" in url: - if not url.startswith("https://"): - logger.warning(f"{url} should start with https://") - return - else: - return "github" - # case 2 - remote URL - elif url_exists(url): # not check whether it is a git URL - return "remote" - # case 3 - local path - elif is_git_repo(Path(parsed_url.path)): - return "local" - logger.warning(f"{url} is not a valid URL") - return None - - @cache def get_repo_info(path: Path) -> Tuple[str, str]: """Get the URL and commit hash of the Git repo at ``path``. diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index c25b18c..cf8b4d4 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -3,9 +3,10 @@ from lean_dojo.constants import LEAN4_URL from git import Repo from github.Repository import Repository -from lean_dojo.utils import working_directory, repo_type_of_url +from lean_dojo.utils import working_directory from lean_dojo.data_extraction.lean import ( _to_commit_hash, + repo_type_of_url, url_to_repo, get_latest_commit, is_commit_hash, @@ -24,6 +25,8 @@ def test_url_to_repo(lean4_example_url, remote_example_url): ## test url_to_repo & repo_type_of_url github_repo = url_to_repo(lean4_example_url) assert repo_type_of_url(lean4_example_url) == "github" + assert repo_type_of_url("git@github.com:yangky11/lean4-example.git") == "github" + assert repo_type_of_url("git@github.com:yangky11/lean4-example") == "github" assert isinstance(github_repo, Repository) assert github_repo.name == repo_name From 8f1b76534cd5e39310b91442b48cb0e394dc24b6 Mon Sep 17 00:00:00 2001 From: rexwzh <1073853456@qq.com> Date: Tue, 30 Jul 2024 02:28:49 +0800 Subject: [PATCH 20/34] simplify repo cache --- src/lean_dojo/data_extraction/lean.py | 68 ++++++++++++------------- tests/data_extraction/test_lean_repo.py | 3 ++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 1c5048b..bb46bf0 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -94,6 +94,16 @@ def repo_type_of_url(url: str) -> Union[str, None]: return None +def _format_dirname(url: str, commit: str) -> str: + user_name, repo_name = _split_git_url(url) + repo_type = repo_type_of_url(url) + assert repo_type is not None, f"Invalid url {url}" + if repo_type == "github": + return f"{user_name}-{repo_name}-{commit}" + else: # git repo + return f"gitpython-{repo_name}-{commit}" + + @cache def url_to_repo( url: str, @@ -506,25 +516,29 @@ def __post_init__(self) -> None: # set repo and commit if repo_type == "github": repo = url_to_repo(self.url, repo_type=repo_type) - # Convert tags or branches to commit hashes - if not is_commit_hash(self.commit): - if (self.url, self.commit) in info_cache.tag2commit: - commit = info_cache.tag2commit[(self.url, self.commit)] - else: - commit = _to_commit_hash(repo, self.commit) - assert is_commit_hash(commit), f"Invalid commit hash: {commit}" - info_cache.tag2commit[(self.url, commit)] = commit - object.__setattr__(self, "commit", commit) else: # get repo from cache - cache_repo_path = repo_cache.get( - REPO_CACHE_PREFIX / self.format_dirname / self.name + rel_cache_dir = lambda url, commit: Path( + f"{REPO_CACHE_PREFIX}/{_format_dirname(url, commit)}/{self.name}" ) - # clone and store the repo if not in cache - if cache_repo_path is None: - cache_repo_path = self.add_to_cache() - repo = Repo(cache_repo_path) - object.__setattr__(self, "commit", _to_commit_hash(repo, self.commit)) + cache_repo_dir = repo_cache.get(rel_cache_dir(self.url, self.commit)) + if cache_repo_dir is None: + with working_directory() as tmp_dir: + repo = url_to_repo(self.url, repo_type=repo_type, tmp_dir=tmp_dir) + commit = _to_commit_hash(repo, self.commit) + cache_repo_dir = repo_cache.store( + repo.working_dir, rel_cache_dir(self.url, commit) + ) + repo = Repo(cache_repo_dir) + # Convert tags or branches to commit hashes + if not is_commit_hash(self.commit): + if (self.url, self.commit) in info_cache.tag2commit: + commit = info_cache.tag2commit[(self.url, self.commit)] + else: + commit = _to_commit_hash(repo, self.commit) + assert is_commit_hash(commit), f"Invalid commit hash: {commit}" + info_cache.tag2commit[(self.url, commit)] = commit + object.__setattr__(self, "commit", commit) object.__setattr__(self, "repo", repo) # Determine the required Lean version. @@ -561,26 +575,10 @@ def commit_url(self) -> str: return f"{self.url}/tree/{self.commit}" @property - def format_dirname(self) -> str: + def format_dirname(self) -> Path: """Return the formatted cache directory name""" - if self.repo_type == "github": - user_name, repo_name = _split_git_url(self.url) - else: # f"gitpython-{repo_name}-{commit}" - user_name, repo_name = "gitpython", self.name - return Path(f"{user_name}-{repo_name}-{self.commit}") - - def add_to_cache(self) -> Path: - """Store the repo in the cache directory.""" - assert self.repo_type in [ - "local", - "remote", - ], f"Unsupported cache repo type: {self.repo_type}" - with working_directory() as tmp_dir: - repo = url_to_repo(self.url, repo_type=self.repo_type, tmp_dir=tmp_dir) - commit = _to_commit_hash(repo, self.commit) - repo.git.checkout(commit) - rel_cache_dir = REPO_CACHE_PREFIX / self.format_dirname / self.name - return repo_cache.store(repo.working_dir, rel_cache_dir) + assert is_commit_hash(self.commit), f"Invalid commit hash: {self.commit}" + return Path(_format_dirname(self.url, self.commit)) def show(self) -> None: """Show the repo in the default browser.""" diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index d9763a7..04abb7f 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -43,6 +43,7 @@ def test_github_type(lean4_example_url, example_commit_hash): ) ## LeanGitRepo + LeanGitRepo(lean4_example_url, "main") # init with branch repo = LeanGitRepo(lean4_example_url, example_commit_hash) assert repo.url == lean4_example_url assert repo.repo_type == "github" @@ -71,6 +72,7 @@ def test_remote_type(remote_example_url, example_commit_hash): ) ## LeanGitRepo + LeanGitRepo(remote_example_url, "main") repo = LeanGitRepo(remote_example_url, example_commit_hash) assert repo.url == remote_example_url assert repo.repo_type == "remote" @@ -117,6 +119,7 @@ def test_local_type(lean4_example_url, example_commit_hash): assert _to_commit_hash(repo, "v0.1.0") == example_commit_hash ## LeanGitRepo + LeanGitRepo(local_url, "main") repo = LeanGitRepo(local_url, example_commit_hash) repo2 = LeanGitRepo.from_path(local_url) # test from_path assert repo.url == local_url == repo2.url From fbfa0cddcfb455c04c0bc0e0faf80efd163721f4 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Tue, 30 Jul 2024 11:52:02 +0000 Subject: [PATCH 21/34] bump to v2.1.0 --- docs/source/conf.py | 2 +- pyproject.toml | 2 +- src/lean_dojo/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5edd158..e12d4d8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ project = "LeanDojo" copyright = "2023, LeanDojo Team" author = "Kaiyu Yang" -release = "2.0.3" +release = "2.1.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/pyproject.toml b/pyproject.toml index d6c7061..fec8ce5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ exclude = [ [project] name = "lean-dojo" -version = "2.0.3" +version = "2.1.0" authors = [ { name="Kaiyu Yang", email="kaiyuy@meta.com" }, ] diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index fd3f74b..e19a83a 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -14,7 +14,7 @@ load_dotenv() -__version__ = "2.0.3" +__version__ = "2.1.0" logger.remove() if "VERBOSE" in os.environ or "DEBUG" in os.environ: From 7adf0336af6f3f9273bceec307adffc1e813fb10 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 00:29:56 +0000 Subject: [PATCH 22/34] minor cleanup --- src/lean_dojo/constants.py | 33 +++++-------- src/lean_dojo/data_extraction/cache.py | 2 +- src/lean_dojo/data_extraction/lean.py | 4 +- src/lean_dojo/data_extraction/traced_data.py | 1 - src/lean_dojo/utils.py | 50 +------------------- 5 files changed, 17 insertions(+), 73 deletions(-) diff --git a/src/lean_dojo/constants.py b/src/lean_dojo/constants.py index e19a83a..590c23a 100644 --- a/src/lean_dojo/constants.py +++ b/src/lean_dojo/constants.py @@ -73,26 +73,19 @@ def check_git_version(min_version: Tuple[int, int, int]) -> None: """Check the version of Git installed on the system.""" - try: - res = subprocess.run( - "git --version", shell=True, capture_output=True, check=True - ) - output = res.stdout.decode().strip() - error = res.stderr.decode() - assert error == "", error - match = re.search(r"git version (\d+\.\d+\.\d+)", output) - if not match: - raise ValueError("Could not parse Git version from the output.") - # Convert version number string to tuple of integers - version = tuple(int(_) for _ in match.group(1).split(".")) - - version_str = ".".join(str(_) for _ in version) - min_version_str = ".".join(str(_) for _ in min_version) - assert ( - version >= min_version - ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." - except subprocess.CalledProcessError as e: - raise Exception(f"Failed to run git command: {e}") + res = subprocess.run("git --version", shell=True, capture_output=True, check=True) + output = res.stdout.decode().strip() + error = res.stderr.decode() + assert error == "", error + m = re.search(r"git version (\d+\.\d+\.\d+)", output) + assert m, f"Could not parse Git version from: {output}" + # Convert version number string to tuple of integers + version = tuple(int(_) for _ in m.group(1).split(".")) + version_str = ".".join(str(_) for _ in version) + min_version_str = ".".join(str(_) for _ in min_version) + assert ( + version >= min_version + ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." check_git_version((2, 25, 0)) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index bbbafd3..daaa3cb 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -8,7 +8,7 @@ from loguru import logger from filelock import FileLock from dataclasses import dataclass, field -from typing import Optional, Tuple, Generator, Union +from typing import Optional, Tuple, Generator from ..utils import ( execute, diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index bb46bf0..f453cf7 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -13,12 +13,13 @@ from pathlib import Path from loguru import logger from functools import cache +from git import Repo, BadName from github import Github, Auth from dataclasses import dataclass, field from github.Repository import Repository from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator -from git import Repo, BadName + from ..constants import TMP_DIR import uuid import shutil @@ -28,7 +29,6 @@ from ..utils import ( read_url, url_exists, - get_repo_info, working_directory, is_git_repo, ) diff --git a/src/lean_dojo/data_extraction/traced_data.py b/src/lean_dojo/data_extraction/traced_data.py index cf4a23a..82317b2 100644 --- a/src/lean_dojo/data_extraction/traced_data.py +++ b/src/lean_dojo/data_extraction/traced_data.py @@ -15,7 +15,6 @@ from loguru import logger from dataclasses import dataclass, field from typing import List, Optional, Dict, Any, Tuple, Union -from git import Repo from ..utils import ( is_git_repo, diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index cc3e211..3efad4c 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -5,7 +5,7 @@ import os import ray import time -import urllib, urllib.request, urllib.error +import urllib import typing import hashlib import tempfile @@ -16,7 +16,6 @@ from contextlib import contextmanager from ray.util.actor_pool import ActorPool from typing import Tuple, Union, List, Generator, Optional -from urllib.parse import urlparse from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR @@ -145,53 +144,6 @@ def camel_case(s: str) -> str: return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") -def repo_type_of_url(url: str) -> str: - """Get the type of the repository. - - Args: - url (str): The URL of the repository. - - Returns: - str: The type of the repository. - """ - parsed_url = urlparse(url) - if parsed_url.scheme in ["http", "https"]: - # case 1 - GitHub URL - if "github.com" in url: - if not url.startswith("https://"): - logger.warning(f"{url} should start with https://") - return - else: - return "github" - # case 2 - remote Git URL - else: - return "remote" - # case 3 - local path - elif os.path.exists(parsed_url.path): - return "local" - else: - logger.warning(f"{url} is not a valid URL") - - -@cache -def get_repo_info(path: Path) -> Tuple[str, str]: - """Get the URL and commit hash of the Git repo at ``path``. - - Args: - path (Path): Path to the Git repo. - - Returns: - Tuple[str, str]: URL and (most recent) hash commit - """ - url = str(path.absolute()) # use the absolute path - # Get the commit. - commit_msg, _ = execute(f"git log -n 1", capture_output=True) - m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) - assert m is not None - commit = m.group() - return url, commit - - def is_optional_type(tp: type) -> bool: """Test if ``tp`` is Optional[X].""" if typing.get_origin(tp) != Union: From 43f770b4a2ab362671c59616f42ec0cd005e0428 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 00:45:09 +0000 Subject: [PATCH 23/34] move _split_git_url --- src/lean_dojo/data_extraction/cache.py | 12 ------------ src/lean_dojo/data_extraction/lean.py | 12 +++++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index daaa3cb..b96d81b 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -21,18 +21,6 @@ REMOTE_CACHE_URL, ) - -def _split_git_url(url: str) -> Tuple[str, str]: - """Split a Git URL into user name and repo name.""" - if url.endswith("/"): - url = url[:-1] - assert not url.endswith("/"), f"Unexpected URL: {url}" - fields = url.split("/") - user_name = fields[-2] - repo_name = fields[-1] - return user_name, repo_name - - _CACHE_CORRPUTION_MSG = "The cache may have been corrputed!" diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index f453cf7..fddfe5e 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -25,7 +25,6 @@ import shutil from urllib.parse import urlparse from .cache import cache as repo_cache -from .cache import _split_git_url from ..utils import ( read_url, url_exists, @@ -94,6 +93,17 @@ def repo_type_of_url(url: str) -> Union[str, None]: return None +def _split_git_url(url: str) -> Tuple[str, str]: + """Split a Git URL into user name and repo name.""" + if url.endswith("/"): + url = url[:-1] + assert not url.endswith("/"), f"Unexpected URL: {url}" + fields = url.split("/") + user_name = fields[-2] + repo_name = fields[-1] + return user_name, repo_name + + def _format_dirname(url: str, commit: str) -> str: user_name, repo_name = _split_git_url(url) repo_type = repo_type_of_url(url) From 515d3613a0222f4c585b817332c7289775560bc2 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 00:49:21 +0000 Subject: [PATCH 24/34] edit imports --- src/lean_dojo/data_extraction/cache.py | 9 +++++++-- src/lean_dojo/data_extraction/lean.py | 13 ++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/lean_dojo/data_extraction/cache.py b/src/lean_dojo/data_extraction/cache.py index b96d81b..3bf239c 100644 --- a/src/lean_dojo/data_extraction/cache.py +++ b/src/lean_dojo/data_extraction/cache.py @@ -7,8 +7,8 @@ from pathlib import Path from loguru import logger from filelock import FileLock +from typing import Optional, Generator from dataclasses import dataclass, field -from typing import Optional, Tuple, Generator from ..utils import ( execute, @@ -21,6 +21,7 @@ REMOTE_CACHE_URL, ) + _CACHE_CORRPUTION_MSG = "The cache may have been corrputed!" @@ -42,7 +43,11 @@ def __post_init__(self): object.__setattr__(self, "lock", FileLock(lock_path)) def get(self, rel_cache_dir: Path) -> Optional[Path]: - """Get the cache repo at ``CACHE_DIR / rel_cache_dir`` from the cache.""" + """Get the cache repo at ``CACHE_DIR / rel_cache_dir`` from the cache. + + Args: + rel_cache_dir (Path): The relative path of the stored repo in the cache. + """ dirname = rel_cache_dir.parent dirpath = self.cache_dir / dirname cache_path = self.cache_dir / rel_cache_dir diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index fddfe5e..92a21e1 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -5,8 +5,10 @@ import re import os import json +import uuid import toml import time +import shutil import urllib import webbrowser import shutil @@ -20,18 +22,15 @@ from github.GithubException import GithubException from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator -from ..constants import TMP_DIR -import uuid -import shutil -from urllib.parse import urlparse -from .cache import cache as repo_cache from ..utils import ( read_url, url_exists, working_directory, is_git_repo, ) -from ..constants import LEAN4_URL +from .cache import cache as repo_cache +from ..constants import TMP_DIR, LEAN4_URL + GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None) """GiHub personal access token is optional. @@ -74,7 +73,7 @@ def repo_type_of_url(url: str) -> Union[str, None]: """ m = _SSH_TO_HTTPS_REGEX.match(url) url = f"https://github.com/{m.group(1)}/{m.group(2)}" if m else url - parsed_url = urlparse(url) + parsed_url = urllib.parse.urlparse(url) if parsed_url.scheme in ["http", "https"]: # case 1 - GitHub URL if "github.com" in url: From 897c6f1ad6d94f67ca6b11a9dce53f962537a6cf Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 00:56:38 +0000 Subject: [PATCH 25/34] minor edits to get_traced_repo_path --- src/lean_dojo/data_extraction/trace.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 1fae85a..c0f3c4b 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -204,16 +204,16 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: Returns: Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2` """ - path = cache.get(repo.format_dirname / repo.name) + rel_cache_dir = repo.format_dirname / repo.name + path = cache.get(rel_cache_dir) if path is None: logger.info(f"Tracing {repo}") with working_directory() as tmp_dir: logger.debug(f"Working in the temporary directory {tmp_dir}") _trace(repo, build_deps) - traced_repo = TracedRepo.from_traced_files(tmp_dir / repo.name, build_deps) - traced_repo.save_to_disk() src_dir = tmp_dir / repo.name - rel_cache_dir = Path(repo.format_dirname) / repo.name + traced_repo = TracedRepo.from_traced_files(src_dir, build_deps) + traced_repo.save_to_disk() path = cache.store(src_dir, rel_cache_dir) else: logger.debug("The traced repo is available in the cache.") From 592f9b5b514c339b72c4e021b650204c29018aa5 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 01:58:25 +0000 Subject: [PATCH 26/34] minor edits --- src/lean_dojo/data_extraction/lean.py | 96 +++++++++++++------------ src/lean_dojo/data_extraction/trace.py | 2 +- tests/data_extraction/test_cache.py | 11 +-- tests/data_extraction/test_lean_repo.py | 40 +++++------ tests/data_extraction/test_trace.py | 14 ++-- tests/interaction/test_interaction.py | 16 +++-- 6 files changed, 92 insertions(+), 87 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 92a21e1..0d4001f 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -5,13 +5,12 @@ import re import os import json -import uuid import toml import time -import shutil import urllib import webbrowser import shutil +from enum import Enum from pathlib import Path from loguru import logger from functools import cache @@ -57,37 +56,44 @@ REPO_CACHE_PREFIX = "repos" -def normalize_url(url: str, repo_type: str = "github") -> str: - if repo_type == "local": - return os.path.abspath(url) # Convert to absolute path if local - return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. +class RepoType(Enum): + GITHUB = 0 + REMOTE = 1 # Remote but not GitHub. + LOCAL = 2 + +def normalize_url(url: str, repo_type: RepoType = RepoType.GITHUB) -> str: + if repo_type == RepoType.LOCAL: # Convert to absolute path if local. + return os.path.abspath(url) + # Remove trailing `/`. + return _URL_REGEX.fullmatch(url)["url"] -def repo_type_of_url(url: str) -> Union[str, None]: + +def get_repo_type(url: str) -> Optional[RepoType]: """Get the type of the repository. Args: url (str): The URL of the repository. Returns: - str: The type of the repository. + Optional[str]: The type of the repository (None if the repo cannot be found). """ m = _SSH_TO_HTTPS_REGEX.match(url) url = f"https://github.com/{m.group(1)}/{m.group(2)}" if m else url parsed_url = urllib.parse.urlparse(url) if parsed_url.scheme in ["http", "https"]: - # case 1 - GitHub URL + # Case 1 - GitHub URL. if "github.com" in url: if not url.startswith("https://"): logger.warning(f"{url} should start with https://") - return + return None else: - return "github" - # case 2 - remote URL - elif url_exists(url): # not check whether it is a git URL - return "remote" - # case 3 - local path + return RepoType.GITHUB + # Case 2 - remote URL. + elif url_exists(url): # Not check whether it is a git URL + return RepoType.REMOTE + # Case 3 - local path elif is_git_repo(Path(parsed_url.path)): - return "local" + return RepoType.LOCAL logger.warning(f"{url} is not a valid URL") return None @@ -103,11 +109,11 @@ def _split_git_url(url: str) -> Tuple[str, str]: return user_name, repo_name -def _format_dirname(url: str, commit: str) -> str: +def _format_cache_dirname(url: str, commit: str) -> str: user_name, repo_name = _split_git_url(url) - repo_type = repo_type_of_url(url) + repo_type = get_repo_type(url) assert repo_type is not None, f"Invalid url {url}" - if repo_type == "github": + if repo_type == RepoType.GITHUB: return f"{user_name}-{repo_name}-{commit}" else: # git repo return f"gitpython-{repo_name}-{commit}" @@ -117,7 +123,7 @@ def _format_dirname(url: str, commit: str) -> str: def url_to_repo( url: str, num_retries: int = 2, - repo_type: Union[str, None] = None, + repo_type: Optional[RepoType] = None, tmp_dir: Union[Path] = None, ) -> Union[Repo, Repository]: """Convert a URL to a Repo object. @@ -125,7 +131,7 @@ def url_to_repo( Args: url (str): The URL of the repository. num_retries (int): Number of retries in case of failure. - repo_type (Optional[str]): The type of the repository. Defaults to None. + repo_type (Optional[RepoType]): The type of the repository. Defaults to None. tmp_dir (Optional[Path]): The temporary directory to clone the repo to. Defaults to None. Returns: @@ -133,16 +139,18 @@ def url_to_repo( """ url = normalize_url(url) backoff = 1 - tmp_dir = tmp_dir or os.path.join(TMP_DIR or "/tmp", str(uuid.uuid4())[:8]) - repo_type = repo_type or repo_type_of_url(url) + tmp_dir = tmp_dir or os.path.join( + TMP_DIR or "/tmp", next(tempfile._get_candidate_names()) + ) + repo_type = repo_type or get_repo_type(url) assert repo_type is not None, f"Invalid url {url}" while True: try: - if repo_type == "github": + if repo_type == RepoType.GITHUB: return GITHUB.get_repo("/".join(url.split("/")[-2:])) with working_directory(tmp_dir): repo_name = os.path.basename(url) - if repo_type == "local": + if repo_type == RepoType.LOCAL: assert is_git_repo(url), f"Local path {url} is not a git repo" shutil.copytree(url, repo_name) return Repo(repo_name) @@ -174,26 +182,22 @@ def cleanse_string(s: Union[str, Path]) -> str: def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: """Convert a tag or branch to a commit hash.""" - # GitHub repository - if isinstance(repo, Repository): + if isinstance(repo, Repository): # GitHub repository logger.debug(f"Querying the commit hash for {repo.name} {label}") try: - commit = repo.get_commit(label).sha - except GithubException as e: + return repo.get_commit(label).sha + except GithubException as ex: raise ValueError(f"Invalid tag or branch: `{label}` for {repo.name}") - # Local or remote Git repository - elif isinstance(repo, Repo): + else: # Local or remote Git repository + assert isinstance(repo, Repo) logger.debug( f"Querying the commit hash for {repo.working_dir} repository {label}" ) try: # Resolve the label to a commit hash - commit = repo.commit(label).hexsha - except Exception as e: + return repo.commit(label).hexsha + except Exception as ex: raise ValueError(f"Error converting ref to commit hash: {e}") - else: - raise TypeError("Unsupported repository type") - return commit @dataclass(eq=True, unsafe_hash=True) @@ -493,8 +497,7 @@ class LeanGitRepo: url: str """The repo's URL. - It can be a GitHub URL that starts with https:// or git@github.com, - a local path, or any other valid Git URL. + It can be a GitHub URL that starts with https:// or git@github.com, a local path, or any other valid Git URL. """ commit: str @@ -512,23 +515,23 @@ class LeanGitRepo: """Required Lean version. """ - repo_type: str = field(init=False, repr=False) - """Type of the repo. It can be ``github``, ``local`` or ``remote``. + repo_type: RepoType = field(init=False, repr=False) + """Type of the repo. It can be ``GITHUB``, ``LOCAL`` or ``REMOTE``. """ def __post_init__(self) -> None: - repo_type = repo_type_of_url(self.url) + repo_type = get_repo_type(self.url) if repo_type is None: raise ValueError(f"{self.url} is not a valid URL") object.__setattr__(self, "repo_type", repo_type) object.__setattr__(self, "url", normalize_url(self.url, repo_type=repo_type)) # set repo and commit - if repo_type == "github": + if repo_type == RepoType.GITHUB: repo = url_to_repo(self.url, repo_type=repo_type) else: # get repo from cache rel_cache_dir = lambda url, commit: Path( - f"{REPO_CACHE_PREFIX}/{_format_dirname(url, commit)}/{self.name}" + f"{REPO_CACHE_PREFIX}/{_format_cache_dirname(url, commit)}/{self.name}" ) cache_repo_dir = repo_cache.get(rel_cache_dir(self.url, self.commit)) if cache_repo_dir is None: @@ -583,18 +586,17 @@ def is_lean4(self) -> bool: def commit_url(self) -> str: return f"{self.url}/tree/{self.commit}" - @property def format_dirname(self) -> Path: """Return the formatted cache directory name""" assert is_commit_hash(self.commit), f"Invalid commit hash: {self.commit}" - return Path(_format_dirname(self.url, self.commit)) + return Path(_format_cache_dirname(self.url, self.commit)) def show(self) -> None: """Show the repo in the default browser.""" webbrowser.open(self.commit_url) def exists(self) -> bool: - if self.repo_type != "github": + if self.repo_type != RepoType.GITHUB: repo = self.repo # git repo try: repo.commit(self.commit) @@ -746,7 +748,7 @@ def _get_config_url(self, filename: str) -> str: def get_config(self, filename: str, num_retries: int = 2) -> Dict[str, Any]: """Return the repo's files.""" - if self.repo_type == "github": + if self.repo_type == RepoType.GITHUB: config_url = self._get_config_url(filename) content = read_url(config_url, num_retries) else: diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index c0f3c4b..5a2d90d 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -204,7 +204,7 @@ def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: Returns: Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2` """ - rel_cache_dir = repo.format_dirname / repo.name + rel_cache_dir = repo.get_cache_dirname() / repo.name path = cache.get(rel_cache_dir) if path is None: logger.info(f"Tracing {repo}") diff --git a/tests/data_extraction/test_cache.py b/tests/data_extraction/test_cache.py index 4741239..e80d0f8 100644 --- a/tests/data_extraction/test_cache.py +++ b/tests/data_extraction/test_cache.py @@ -1,17 +1,15 @@ # test for cache manager from git import Repo -from lean_dojo.utils import working_directory from pathlib import Path +from lean_dojo.utils import working_directory from lean_dojo.data_extraction.cache import cache -def test_repo_cache(lean4_example_url, remote_example_url, example_commit_hash): +def test_local_repo_cache(lean4_example_url, example_commit_hash): # Note: The `git.Repo` requires the local repo to be cloned in a directory # all cached repos are stored in CACHE_DIR/repos prefix = "repos" repo_name = "lean4-example" - - # test local repo cache with working_directory() as tmp_dir: repo = Repo.clone_from(lean4_example_url, repo_name) repo.git.checkout(example_commit_hash) @@ -24,7 +22,10 @@ def test_repo_cache(lean4_example_url, remote_example_url, example_commit_hash): repo_cache_dir = cache.get(rel_cache_dir) assert repo_cache_dir is not None - # test remote repo cache + +def test_remote_repo_cache(remote_example_url): + prefix = "repos" + repo_name = "lean4-example" with working_directory() as tmp_dir: repo = Repo.clone_from(remote_example_url, repo_name) tmp_remote_dir = tmp_dir / repo_name diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 04abb7f..818182c 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -1,17 +1,17 @@ -# test for the class `LeanGitRepo` -from lean_dojo import LeanGitRepo +# Tests for the class `LeanGitRepo` from git import Repo +from lean_dojo import LeanGitRepo from github.Repository import Repository -from lean_dojo.utils import working_directory from lean_dojo.data_extraction.lean import ( _to_commit_hash, - repo_type_of_url, + get_repo_type, url_to_repo, get_latest_commit, is_commit_hash, GITHUB, - LEAN4_REPO, + RepoType, ) +from lean_dojo.utils import working_directory def test_github_type(lean4_example_url, example_commit_hash): @@ -21,11 +21,11 @@ def test_github_type(lean4_example_url, example_commit_hash): gh_cm_hash = get_latest_commit(lean4_example_url) assert is_commit_hash(gh_cm_hash) - ## url_to_repo & repo_type_of_url + ## url_to_repo & get_repo_type github_repo = url_to_repo(lean4_example_url) - assert repo_type_of_url(lean4_example_url) == "github" - assert repo_type_of_url("git@github.com:yangky11/lean4-example.git") == "github" - assert repo_type_of_url("git@github.com:yangky11/lean4-example") == "github" + assert get_repo_type(lean4_example_url) == RepoType.GITHUB + assert get_repo_type("git@github.com:yangky11/lean4-example.git") == RepoType.GITHUB + assert get_repo_type("git@github.com:yangky11/lean4-example") == RepoType.GITHUB assert isinstance(github_repo, Repository) assert github_repo.name == repo_name @@ -46,7 +46,7 @@ def test_github_type(lean4_example_url, example_commit_hash): LeanGitRepo(lean4_example_url, "main") # init with branch repo = LeanGitRepo(lean4_example_url, example_commit_hash) assert repo.url == lean4_example_url - assert repo.repo_type == "github" + assert repo.repo_type == RepoType.GITHUB assert repo.commit == example_commit_hash assert repo.exists() assert repo.name == repo_name @@ -54,14 +54,14 @@ def test_github_type(lean4_example_url, example_commit_hash): assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repository) - assert str(repo.format_dirname) == f"yangky11-{repo_name}-{example_commit_hash}" + assert str(repo.format_dirname()) == f"yangky11-{repo_name}-{example_commit_hash}" def test_remote_type(remote_example_url, example_commit_hash): repo_name = "lean4-example" remote_repo = url_to_repo(remote_example_url) - assert repo_type_of_url(remote_example_url) == "remote" + assert get_repo_type(remote_example_url) == RepoType.REMOTE assert isinstance(remote_repo, Repo) re_cm_hash = get_latest_commit(remote_example_url) assert re_cm_hash == get_latest_commit(str(remote_repo.working_dir)) @@ -75,7 +75,7 @@ def test_remote_type(remote_example_url, example_commit_hash): LeanGitRepo(remote_example_url, "main") repo = LeanGitRepo(remote_example_url, example_commit_hash) assert repo.url == remote_example_url - assert repo.repo_type == "remote" + assert repo.repo_type == RepoType.REMOTE assert repo.commit == example_commit_hash assert repo.exists() assert repo.name == repo_name @@ -83,7 +83,7 @@ def test_remote_type(remote_example_url, example_commit_hash): assert repo.commit_url == f"{remote_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repo) - assert str(repo.format_dirname) == f"gitpython-{repo_name}-{example_commit_hash}" + assert str(repo.format_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" def test_local_type(lean4_example_url, example_commit_hash): @@ -98,9 +98,9 @@ def test_local_type(lean4_example_url, example_commit_hash): local_url = str((tmp_dir / repo_name).absolute()) assert get_latest_commit(local_url) == gh_cm_hash - ## url_to_repo & repo_type_of_url - local_repo = url_to_repo(local_url, repo_type="local") - assert repo_type_of_url(local_url) == "local" + ## url_to_repo & get_repo_type + local_repo = url_to_repo(local_url, repo_type=RepoType.LOCAL) + assert get_repo_type(local_url) == RepoType.LOCAL assert isinstance(local_repo, Repo) assert ( local_repo.working_dir != local_url @@ -123,7 +123,7 @@ def test_local_type(lean4_example_url, example_commit_hash): repo = LeanGitRepo(local_url, example_commit_hash) repo2 = LeanGitRepo.from_path(local_url) # test from_path assert repo.url == local_url == repo2.url - assert repo.repo_type == "local" == repo2.repo_type + assert repo.repo_type == RepoType.LOCAL == repo2.repo_type assert repo.commit == example_commit_hash and repo2.commit == gh_cm_hash assert repo.exists() and repo2.exists() assert repo.name == repo_name == repo2.name @@ -131,6 +131,6 @@ def test_local_type(lean4_example_url, example_commit_hash): # cache name assert isinstance(repo.repo, Repo) and isinstance(repo2.repo, Repo) assert ( - str(repo.format_dirname) == f"gitpython-{repo_name}-{example_commit_hash}" + str(repo.format_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" ) - assert str(repo2.format_dirname) == f"gitpython-{repo_name}-{gh_cm_hash}" + assert str(repo2.format_dirname()) == f"gitpython-{repo_name}-{gh_cm_hash}" diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 24df62f..04bef5f 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -2,25 +2,25 @@ from lean_dojo import * from lean_dojo.data_extraction.cache import cache from lean_dojo.utils import working_directory -from lean_dojo.data_extraction.lean import url_to_repo +from lean_dojo.data_extraction.lean import RepoType from git import Repo def test_github_trace(lean4_example_url): # github github_repo = LeanGitRepo(lean4_example_url, "main") - assert github_repo.repo_type == "github" + assert github_repo.repo_type == RepoType.GITHUB trace_repo = trace(github_repo) - path = cache.get(github_repo.format_dirname / github_repo.name) + path = cache.get(github_repo.format_dirname() / github_repo.name) assert path is not None def test_remote_trace(remote_example_url): # remote remote_repo = LeanGitRepo(remote_example_url, "main") - assert remote_repo.repo_type == "remote" + assert remote_repo.repo_type == RepoType.REMOTE trace_repo = trace(remote_repo) - path = cache.get(remote_repo.format_dirname / remote_repo.name) + path = cache.get(remote_repo.format_dirname() / remote_repo.name) assert path is not None @@ -33,9 +33,9 @@ def test_local_trace(lean4_example_url): local_url = str((tmp_dir / "lean4-example").absolute()) local_repo = LeanGitRepo(local_dir, "main") assert local_repo.url == local_url - assert local_repo.repo_type == "local" + assert local_repo.repo_type == RepoType.LOCAL trace_repo = trace(local_repo) - path = cache.get(local_repo.format_dirname / local_repo.name) + path = cache.get(local_repo.format_dirname() / local_repo.name) assert path is not None diff --git a/tests/interaction/test_interaction.py b/tests/interaction/test_interaction.py index 3eff4d9..83b1c45 100644 --- a/tests/interaction/test_interaction.py +++ b/tests/interaction/test_interaction.py @@ -1,15 +1,17 @@ -from lean_dojo import LeanGitRepo, Dojo, ProofFinished, ProofGivenUp, Theorem -from lean_dojo.utils import working_directory -from git import Repo import os +from git import Repo +from lean_dojo.utils import working_directory +from lean_dojo.data_extraction.lean import RepoType +from lean_dojo import LeanGitRepo, Dojo, ProofFinished, ProofGivenUp, Theorem + # Avoid using remote cache -os.environ["DISABLE_REMOTE_CACHE"] = "true" +os.environ["DISABLE_REMOTE_CACHE"] = "1" def test_github_interact(lean4_example_url): repo = LeanGitRepo(url=lean4_example_url, commit="main") - assert repo.repo_type == "github" + assert repo.repo_type == RepoType.GITHUB theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state dojo, state_0 = Dojo(theorem).__enter__() @@ -26,7 +28,7 @@ def test_github_interact(lean4_example_url): def test_remote_interact(remote_example_url): repo = LeanGitRepo(url=remote_example_url, commit="main") - assert repo.repo_type == "remote" + assert repo.repo_type == RepoType.REMOTE theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state dojo, state_0 = Dojo(theorem).__enter__() @@ -49,7 +51,7 @@ def test_local_interact(lean4_example_url): local_dir = str((tmp_dir / "lean4-example")) repo = LeanGitRepo(local_dir, commit="main") - assert repo.repo_type == "local" + assert repo.repo_type == RepoType.LOCAL theorem = Theorem(repo, "Lean4Example.lean", "hello_world") # initial state dojo, state_0 = Dojo(theorem).__enter__() From 0d387073de67bac95e21fb8ede4da9f806827b9e Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 02:08:04 +0000 Subject: [PATCH 27/34] minox fix --- src/lean_dojo/data_extraction/lean.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 0d4001f..d7cdf32 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -8,8 +8,9 @@ import toml import time import urllib -import webbrowser import shutil +import tempfile +import webbrowser from enum import Enum from pathlib import Path from loguru import logger From 493d89026710f54faa59eb9970c51ff63f5011d5 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 02:10:21 +0000 Subject: [PATCH 28/34] fix get_cache_dirname --- src/lean_dojo/data_extraction/lean.py | 2 +- tests/data_extraction/test_lean_repo.py | 8 ++++---- tests/data_extraction/test_trace.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index d7cdf32..010469a 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -587,7 +587,7 @@ def is_lean4(self) -> bool: def commit_url(self) -> str: return f"{self.url}/tree/{self.commit}" - def format_dirname(self) -> Path: + def get_cache_dirname(self) -> Path: """Return the formatted cache directory name""" assert is_commit_hash(self.commit), f"Invalid commit hash: {self.commit}" return Path(_format_cache_dirname(self.url, self.commit)) diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 818182c..5d20bc4 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -54,7 +54,7 @@ def test_github_type(lean4_example_url, example_commit_hash): assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repository) - assert str(repo.format_dirname()) == f"yangky11-{repo_name}-{example_commit_hash}" + assert str(repo.get_cache_dirname()) == f"yangky11-{repo_name}-{example_commit_hash}" def test_remote_type(remote_example_url, example_commit_hash): @@ -83,7 +83,7 @@ def test_remote_type(remote_example_url, example_commit_hash): assert repo.commit_url == f"{remote_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repo) - assert str(repo.format_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" + assert str(repo.get_cache_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" def test_local_type(lean4_example_url, example_commit_hash): @@ -131,6 +131,6 @@ def test_local_type(lean4_example_url, example_commit_hash): # cache name assert isinstance(repo.repo, Repo) and isinstance(repo2.repo, Repo) assert ( - str(repo.format_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" + str(repo.get_cache_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" ) - assert str(repo2.format_dirname()) == f"gitpython-{repo_name}-{gh_cm_hash}" + assert str(repo2.get_cache_dirname()) == f"gitpython-{repo_name}-{gh_cm_hash}" diff --git a/tests/data_extraction/test_trace.py b/tests/data_extraction/test_trace.py index 04bef5f..0c9710a 100644 --- a/tests/data_extraction/test_trace.py +++ b/tests/data_extraction/test_trace.py @@ -11,7 +11,7 @@ def test_github_trace(lean4_example_url): github_repo = LeanGitRepo(lean4_example_url, "main") assert github_repo.repo_type == RepoType.GITHUB trace_repo = trace(github_repo) - path = cache.get(github_repo.format_dirname() / github_repo.name) + path = cache.get(github_repo.get_cache_dirname() / github_repo.name) assert path is not None @@ -20,7 +20,7 @@ def test_remote_trace(remote_example_url): remote_repo = LeanGitRepo(remote_example_url, "main") assert remote_repo.repo_type == RepoType.REMOTE trace_repo = trace(remote_repo) - path = cache.get(remote_repo.format_dirname() / remote_repo.name) + path = cache.get(remote_repo.get_cache_dirname() / remote_repo.name) assert path is not None @@ -35,7 +35,7 @@ def test_local_trace(lean4_example_url): assert local_repo.url == local_url assert local_repo.repo_type == RepoType.LOCAL trace_repo = trace(local_repo) - path = cache.get(local_repo.format_dirname() / local_repo.name) + path = cache.get(local_repo.get_cache_dirname() / local_repo.name) assert path is not None From 819ed53bd04c577dc26735422d93030b8f67c4a0 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 02:11:22 +0000 Subject: [PATCH 29/34] format code --- tests/data_extraction/test_lean_repo.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/data_extraction/test_lean_repo.py b/tests/data_extraction/test_lean_repo.py index 5d20bc4..42fc5bd 100644 --- a/tests/data_extraction/test_lean_repo.py +++ b/tests/data_extraction/test_lean_repo.py @@ -54,7 +54,9 @@ def test_github_type(lean4_example_url, example_commit_hash): assert repo.commit_url == f"{lean4_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repository) - assert str(repo.get_cache_dirname()) == f"yangky11-{repo_name}-{example_commit_hash}" + assert ( + str(repo.get_cache_dirname()) == f"yangky11-{repo_name}-{example_commit_hash}" + ) def test_remote_type(remote_example_url, example_commit_hash): @@ -83,7 +85,9 @@ def test_remote_type(remote_example_url, example_commit_hash): assert repo.commit_url == f"{remote_example_url}/tree/{example_commit_hash}" # cache name assert isinstance(repo.repo, Repo) - assert str(repo.get_cache_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" + assert ( + str(repo.get_cache_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" + ) def test_local_type(lean4_example_url, example_commit_hash): @@ -131,6 +135,7 @@ def test_local_type(lean4_example_url, example_commit_hash): # cache name assert isinstance(repo.repo, Repo) and isinstance(repo2.repo, Repo) assert ( - str(repo.get_cache_dirname()) == f"gitpython-{repo_name}-{example_commit_hash}" + str(repo.get_cache_dirname()) + == f"gitpython-{repo_name}-{example_commit_hash}" ) assert str(repo2.get_cache_dirname()) == f"gitpython-{repo_name}-{gh_cm_hash}" From 44742d97544e902b66db77fb1516f40a5a966560 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 12:51:36 +0000 Subject: [PATCH 30/34] add @cache --- src/lean_dojo/data_extraction/lean.py | 1 + tests/interaction/test_tactics.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 010469a..35bb9d2 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -181,6 +181,7 @@ def cleanse_string(s: Union[str, Path]) -> str: return str(s).replace("/", "_").replace(":", "_") +@cache def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: """Convert a tag or branch to a commit hash.""" if isinstance(repo, Repository): # GitHub repository diff --git a/tests/interaction/test_tactics.py b/tests/interaction/test_tactics.py index c0a2ee8..66a347a 100644 --- a/tests/interaction/test_tactics.py +++ b/tests/interaction/test_tactics.py @@ -37,10 +37,10 @@ def test_example_append_subset(batteries_repo: LeanGitRepo) -> None: thm = Theorem( batteries_repo, "Batteries/Data/List/Lemmas.lean", - "List.append_subset", + "List.disjoint_append_left", ) with Dojo(thm) as (dojo, s0): - s1 = dojo.run_tac(s0, "simp [subset_def, or_imp, forall_and]") + s1 = dojo.run_tac(s0, "simp [Disjoint, or_imp, forall_and]") assert isinstance(s1, ProofFinished) assert dojo.is_successful From 777b3c38f5aa0eaa7e209570a5801d40d08264b4 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 13:27:41 +0000 Subject: [PATCH 31/34] fix some typing errors --- .github/workflows/type_check.yaml | 2 +- mypy.ini | 9 +++ src/lean_dojo/data_extraction/ast.py | 10 +-- src/lean_dojo/data_extraction/lean.py | 31 ++++---- src/lean_dojo/data_extraction/trace.py | 16 ++--- src/lean_dojo/data_extraction/traced_data.py | 76 ++++++++++++-------- 6 files changed, 86 insertions(+), 58 deletions(-) diff --git a/.github/workflows/type_check.yaml b/.github/workflows/type_check.yaml index 1e54124..c7161c7 100644 --- a/.github/workflows/type_check.yaml +++ b/.github/workflows/type_check.yaml @@ -15,4 +15,4 @@ jobs: - name: Install dependencies run: pip install ".[all]" - name: Type Check (mypy) - run: mypy src/lean_dojo/interaction + run: mypy src/lean_dojo diff --git a/mypy.ini b/mypy.ini index 9ca677c..c48bfef 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,4 +8,13 @@ disallow_untyped_calls = False follow_imports = skip [mypy-pexpect.*] +ignore_missing_imports = True + +[mypy-lxml.*] +ignore_missing_imports = True + +[mypy-tqdm.*] +ignore_missing_imports = True + +[mypy-networkx.*] ignore_missing_imports = True \ No newline at end of file diff --git a/src/lean_dojo/data_extraction/ast.py b/src/lean_dojo/data_extraction/ast.py index f858c82..2b88640 100644 --- a/src/lean_dojo/data_extraction/ast.py +++ b/src/lean_dojo/data_extraction/ast.py @@ -27,7 +27,7 @@ def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "Node": return subcls.from_data(node_data, lean_file) @classmethod - def _kind_to_node_type(cls, kind: str) -> type: + def _kind_to_node_type(cls, kind: str) -> type["Node"]: prefix = "Lean.Parser." if kind.startswith(prefix): kind = kind[len(prefix) :] @@ -83,7 +83,7 @@ def from_xml(cls, tree: etree.Element, lean_file: LeanFile) -> "Node": start = Pos.from_str(tree.attrib["start"]) if "start" in tree.attrib else None end = Pos.from_str(tree.attrib["end"]) if "end" in tree.attrib else None children = [Node.from_xml(subtree, lean_file) for subtree in tree] - kwargs = {} + kwargs: Dict[str, Any] = {} for field in subcls.__dataclass_fields__.values(): if field.name in ("lean_file", "start", "end", "children"): @@ -113,11 +113,13 @@ def from_xml(cls, tree: etree.Element, lean_file: LeanFile) -> "Node": return subcls(lean_file, start, end, children, **kwargs) # type: ignore - def get_closure(self) -> Tuple[Pos, Pos]: + def get_closure(self) -> Tuple[Optional[Pos], Optional[Pos]]: return self.start, self.end -def _parse_pos(info: Dict[str, Any], lean_file: LeanFile) -> Pos: +def _parse_pos( + info: Dict[str, Any], lean_file: LeanFile +) -> Optional[Tuple[Optional[Pos], Optional[Pos]]]: if "synthetic" in info and not info["synthetic"]["canonical"]: return None diff --git a/src/lean_dojo/data_extraction/lean.py b/src/lean_dojo/data_extraction/lean.py index 35bb9d2..e3aabc9 100644 --- a/src/lean_dojo/data_extraction/lean.py +++ b/src/lean_dojo/data_extraction/lean.py @@ -67,7 +67,7 @@ def normalize_url(url: str, repo_type: RepoType = RepoType.GITHUB) -> str: if repo_type == RepoType.LOCAL: # Convert to absolute path if local. return os.path.abspath(url) # Remove trailing `/`. - return _URL_REGEX.fullmatch(url)["url"] + return _URL_REGEX.fullmatch(url)["url"] # type: ignore def get_repo_type(url: str) -> Optional[RepoType]: @@ -80,7 +80,7 @@ def get_repo_type(url: str) -> Optional[RepoType]: """ m = _SSH_TO_HTTPS_REGEX.match(url) url = f"https://github.com/{m.group(1)}/{m.group(2)}" if m else url - parsed_url = urllib.parse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) # type: ignore if parsed_url.scheme in ["http", "https"]: # Case 1 - GitHub URL. if "github.com" in url: @@ -125,7 +125,7 @@ def url_to_repo( url: str, num_retries: int = 2, repo_type: Optional[RepoType] = None, - tmp_dir: Union[Path] = None, + tmp_dir: Optional[Path] = None, ) -> Union[Repo, Repository]: """Convert a URL to a Repo object. @@ -140,9 +140,8 @@ def url_to_repo( """ url = normalize_url(url) backoff = 1 - tmp_dir = tmp_dir or os.path.join( - TMP_DIR or "/tmp", next(tempfile._get_candidate_names()) - ) + if tmp_dir is None: + tmp_dir = (TMP_DIR or Path("/tmp")) / next(tempfile._get_candidate_names()) # type: ignore repo_type = repo_type or get_repo_type(url) assert repo_type is not None, f"Invalid url {url}" while True: @@ -199,7 +198,7 @@ def _to_commit_hash(repo: Union[Repository, Repo], label: str) -> str: # Resolve the label to a commit hash return repo.commit(label).hexsha except Exception as ex: - raise ValueError(f"Error converting ref to commit hash: {e}") + raise ValueError(f"Error converting ref to commit hash: {ex}") @dataclass(eq=True, unsafe_hash=True) @@ -446,7 +445,9 @@ def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: return _to_commit_hash(LEAN4_REPO, version) -URL = TAG = COMMIT = str +URL = str +TAG = str +COMMIT = str @dataclass(frozen=True) @@ -685,13 +686,13 @@ def _parse_deps( deps = [] for m in matches: - url = m["url"] + url = m["url"] # type: ignore if url.endswith(".git"): url = url[:-4] if url.startswith("git@"): url = "https://" + url[4:].replace(":", "/") - rev = m["rev"] + rev = m["rev"] # type: ignore if rev is None: commit = get_latest_commit(url) elif len(rev) == 40 and _COMMIT_REGEX.fullmatch(rev): @@ -703,7 +704,7 @@ def _parse_deps( commit = get_latest_commit(url) assert _COMMIT_REGEX.fullmatch(commit) - deps.append((m["name"], LeanGitRepo(url, commit))) + deps.append((m["name"], LeanGitRepo(url, commit))) # type: ignore return deps @@ -717,8 +718,8 @@ def _parse_lakefile_toml_dependencies( ) matches = dict() - for requirement in _LAKEFILE_TOML_REQUIREMENT_REGEX.finditer(lakefile): - for line in requirement.strip().splitlines(): + for req in _LAKEFILE_TOML_REQUIREMENT_REGEX.finditer(lakefile): + for line in req.group().strip().splitlines(): key, value = line.split("=") key = key.strip() value = value.strip() @@ -731,7 +732,7 @@ def _parse_lakefile_toml_dependencies( if key == "name": matches["name"] = value - return self._parse_deps(lakefile, matches) + return self._parse_deps(matches) def get_license(self) -> Optional[str]: """Return the content of the ``LICENSE`` file.""" @@ -740,7 +741,7 @@ def get_license(self) -> Optional[str]: license_url = f"{url}/{self.commit}/LICENSE" try: return read_url(license_url) - except urllib.error.HTTPError: + except urllib.error.HTTPError: # type: ignore return None def _get_config_url(self, filename: str) -> str: diff --git a/src/lean_dojo/data_extraction/trace.py b/src/lean_dojo/data_extraction/trace.py index 5a2d90d..072abf4 100644 --- a/src/lean_dojo/data_extraction/trace.py +++ b/src/lean_dojo/data_extraction/trace.py @@ -54,7 +54,7 @@ def _monitor(paths: List[Path], num_total: int) -> None: @contextmanager -def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, None]: +def launch_progressbar(paths: List[Path]) -> Generator[None, None, None]: """Launch an async progressbar to monitor the progress of tracing the repo.""" paths = [Path(p) for p in paths] olean_files = list( @@ -71,7 +71,7 @@ def get_lean_version() -> str: """Get the version of Lean.""" output = execute("lean --version", capture_output=True)[0].strip() m = re.match(r"Lean \(version (?P\S+?),", output) - return m["version"] + return m["version"] # type: ignore def is_new_version(v: str) -> bool: @@ -93,7 +93,7 @@ def is_new_version(v: str) -> bool: return True -def check_files(packages_path: str, no_deps: bool) -> None: +def check_files(packages_path: Path, no_deps: bool) -> None: """Check if all \*.lean files have been processed to produce \*.ast.json and \*.dep_paths files.""" cwd = Path.cwd() packages_path = cwd / packages_path @@ -142,12 +142,12 @@ def _trace(repo: LeanGitRepo, build_deps: bool) -> None: # Copy the Lean 4 stdlib into the path of packages. lean_prefix = execute(f"lean --print-prefix", capture_output=True)[0].strip() if is_new_version(get_lean_version()): - packages_path = ".lake/packages" - build_path = ".lake/build" + packages_path = Path(".lake/packages") + build_path = Path(".lake/build") else: - packages_path = "lake-packages" - build_path = "build" - shutil.copytree(lean_prefix, f"{packages_path}/lean4") + packages_path = Path("lake-packages") + build_path = Path("build") + shutil.copytree(lean_prefix, str(packages_path / "lean4")) # Run ExtractData.lean to extract ASTs, tactic states, and premise information. shutil.copyfile(LEAN4_DATA_EXTRACTOR_PATH, LEAN4_DATA_EXTRACTOR_PATH.name) diff --git a/src/lean_dojo/data_extraction/traced_data.py b/src/lean_dojo/data_extraction/traced_data.py index 82317b2..15a6ff6 100644 --- a/src/lean_dojo/data_extraction/traced_data.py +++ b/src/lean_dojo/data_extraction/traced_data.py @@ -25,7 +25,28 @@ to_json_path, to_xml_path, ) -from .ast import * +from .ast import ( + Node, + FileNode, + OtherNode, + LemmaNode, + IdentNode, + CommandEndNode, + ModuleImportNode, + ModulePreludeNode, + CommandSectionNode, + CommandTheoremNode, + CommandModuledocNode, + CommandNamespaceNode, + CommandDeclarationNode, + MathlibTacticLemmaNode, + TacticTacticseqbracketedNode, + TacticTacticseq1IndentedNode, + CommandNoncomputablesectionNode, + is_leaf, + is_mutual_lean4, + is_potential_premise_lean4, +) from .lean import LeanFile, LeanGitRepo, Theorem, Pos from ..constants import NUM_WORKERS, LOAD_USED_PACKAGES_ONLY, LEAN4_PACKAGES_DIR @@ -184,7 +205,10 @@ def get_annotated_tactic(self) -> Tuple[str, List[Dict[str, Any]]]: Returns: Tuple[str, List[Dict[str, Any]]]: The first return value is the tactic string marked by `` ... ``. The second return value is a list of provenances. """ - assert self.traced_theorem != None + assert ( + self.traced_theorem is not None + and self.traced_theorem.traced_file is not None + ) lean_file = self.traced_theorem.traced_file.lean_file annot_tac = [] provenances = [] @@ -243,7 +267,9 @@ class TracedTheorem: def __post_init__(self) -> None: assert ( - self.root_dir.is_absolute() and self.root_dir == self.traced_file.root_dir + self.root_dir.is_absolute() + and self.traced_file is not None + and self.root_dir == self.traced_file.root_dir ) def __getstate__(self) -> Dict[str, Any]: @@ -272,7 +298,7 @@ def file_path(self) -> Path: return self.theorem.file_path @property - def traced_repo(self) -> "TracedRepo": + def traced_repo(self) -> Optional["TracedRepo"]: """The traced repo this theorem belongs to.""" if self.traced_file is None: return None @@ -325,25 +351,11 @@ def get_tactic_proof(self) -> Optional[str]: def get_theorem_statement(self) -> str: """Return the theorem statement.""" proof_start, _ = self.locate_proof() + assert self.traced_file is not None return get_code_without_comments( self.traced_file.lean_file, self.ast.start, proof_start, self.comments ) - def get_single_tactic_proof(self) -> Optional[str]: - """Wrap the proof into a single (potentially very long) tactic.""" - if not self.has_tactic_proof(): - return None - node = self.get_proof_node() - start, end = node.get_closure() - proof = get_code_without_comments(node.lean_file, start, end, self.comments) - - raise NotImplementedError - assert isinstance(node.children[0], AtomNode) and node.children[0].val == "by" - assert proof.startswith("by") - proof = proof[len("by") :].strip() - - return proof - def get_premise_full_names(self) -> List[str]: """Return the fully qualified names of all premises used in the proof.""" names = [] @@ -712,6 +724,7 @@ def traverse_preorder(self, callback, node_cls: Optional[type] = None): def _get_repo_and_relative_path(self) -> Tuple[LeanGitRepo, Path]: """Return the repo this file belongs to, as well as the file's path relative to it.""" + assert self.traced_repo is not None if self.path.is_relative_to(LEAN4_PACKAGES_DIR): # The theorem belongs to one of the dependencies. p = self.path.relative_to(LEAN4_PACKAGES_DIR) @@ -737,24 +750,26 @@ def get_traced_theorem( def _callback( node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ - ) -> None: + ) -> bool: nonlocal result, private_result - if not isinstance( - node, - ( - CommandTheoremNode, - LemmaNode, - MathlibTacticLemmaNode, - ), + if ( + isinstance( + node, + ( + CommandTheoremNode, + LemmaNode, + MathlibTacticLemmaNode, + ), + ) + and node.full_name == thm.full_name ): - return False - if node.full_name == thm.full_name: comments = self._filter_comments(node.start, node.end) t = TracedTheorem(self.root_dir, thm, node, comments, self) if t.is_private: private_result = t else: result = t + return False self.ast.traverse_preorder(_callback, node_cls=None) @@ -769,7 +784,7 @@ def get_traced_theorems(self) -> List[TracedTheorem]: def _callback( node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ - ) -> None: + ) -> bool: if not isinstance( node, ( @@ -1128,6 +1143,7 @@ def from_traced_files( def get_traced_file(self, path: Union[str, Path]) -> TracedFile: """Return a traced file by its path.""" + assert self.traced_files_graph is not None return self.traced_files_graph.nodes[str(path)]["traced_file"] def _update_traced_files(self) -> None: From e662bc5e1ba5a940ee3892f9f1995950deae7d4f Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 13:55:24 +0000 Subject: [PATCH 32/34] minor fix --- src/lean_dojo/data_extraction/traced_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lean_dojo/data_extraction/traced_data.py b/src/lean_dojo/data_extraction/traced_data.py index 15a6ff6..7d3fbd4 100644 --- a/src/lean_dojo/data_extraction/traced_data.py +++ b/src/lean_dojo/data_extraction/traced_data.py @@ -38,6 +38,7 @@ CommandTheoremNode, CommandModuledocNode, CommandNamespaceNode, + CommandDoccommentNode, CommandDeclarationNode, MathlibTacticLemmaNode, TacticTacticseqbracketedNode, From ddfe9c5bdb772c808634265c345e9cad6d71c0bf Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 5 Aug 2024 20:08:20 +0000 Subject: [PATCH 33/34] update tests --- tests/interaction/test_tactics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/interaction/test_tactics.py b/tests/interaction/test_tactics.py index 66a347a..1aba648 100644 --- a/tests/interaction/test_tactics.py +++ b/tests/interaction/test_tactics.py @@ -130,10 +130,10 @@ def test_example_length_le(batteries_repo: LeanGitRepo) -> None: thm = Theorem( batteries_repo, "Batteries/Data/List/Lemmas.lean", - "List.IsSuffix.length_le", + "List.disjoint_of_disjoint_append_right_right", ) with Dojo(thm) as (dojo, s0): - s1 = dojo.run_tac(s0, "exact h.sublist.length_le") + s1 = dojo.run_tac(s0, "exact (disjoint_append_right.1 d).2") assert isinstance(s1, ProofFinished) assert dojo.is_successful From 297b96e6f544b4c8aed221994d3c85b4270cc877 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Tue, 6 Aug 2024 02:19:51 +0000 Subject: [PATCH 34/34] minor edits --- src/lean_dojo/data_extraction/traced_data.py | 4 ++-- src/lean_dojo/utils.py | 17 ++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/lean_dojo/data_extraction/traced_data.py b/src/lean_dojo/data_extraction/traced_data.py index 7d3fbd4..5e56050 100644 --- a/src/lean_dojo/data_extraction/traced_data.py +++ b/src/lean_dojo/data_extraction/traced_data.py @@ -539,7 +539,7 @@ def from_traced_file( def _from_lean4_traced_file( cls, root_dir: Path, json_path: Path, repo: LeanGitRepo ) -> "TracedFile": - lean_path = to_lean_path(root_dir, json_path, repo) + lean_path = to_lean_path(root_dir, json_path) lean_file = LeanFile(root_dir, lean_path) data = json.load(json_path.open()) @@ -922,7 +922,7 @@ def from_xml( root_dir = Path(root_dir) path = Path(path) assert path.suffixes == [".trace", ".xml"] - lean_path = to_lean_path(root_dir, path, repo) + lean_path = to_lean_path(root_dir, path) lean_file = LeanFile(root_dir, lean_path) tree = etree.parse(path).getroot() diff --git a/src/lean_dojo/utils.py b/src/lean_dojo/utils.py index 3efad4c..57f6240 100644 --- a/src/lean_dojo/utils.py +++ b/src/lean_dojo/utils.py @@ -71,7 +71,7 @@ def ray_actor_pool( """ assert not ray.is_initialized() ray.init() - pool = ActorPool([actor_cls.remote(*args, **kwargs) for _ in range(NUM_WORKERS)]) + pool = ActorPool([actor_cls.remote(*args, **kwargs) for _ in range(NUM_WORKERS)]) # type: ignore try: yield pool finally: @@ -154,8 +154,7 @@ def is_optional_type(tp: type) -> bool: def remove_optional_type(tp: type) -> type: """Given Optional[X], return X.""" - if typing.get_origin(tp) != Union: - return False + assert typing.get_origin(tp) == Union args = typing.get_args(tp) if len(args) == 2 and args[1] == type(None): return args[0] @@ -169,11 +168,11 @@ def read_url(url: str, num_retries: int = 2) -> str: backoff = 1 while True: try: - request = urllib.request.Request(url) + request = urllib.request.Request(url) # type: ignore gh_token = os.getenv("GITHUB_ACCESS_TOKEN") if gh_token is not None: request.add_header("Authorization", f"token {gh_token}") - with urllib.request.urlopen(request) as f: + with urllib.request.urlopen(request) as f: # type: ignore return f.read().decode() except Exception as ex: if num_retries <= 0: @@ -188,13 +187,13 @@ def read_url(url: str, num_retries: int = 2) -> str: def url_exists(url: str) -> bool: """Return True if the URL ``url`` exists, using the GITHUB_ACCESS_TOKEN for authentication if provided.""" try: - request = urllib.request.Request(url) + request = urllib.request.Request(url) # type: ignore gh_token = os.getenv("GITHUB_ACCESS_TOKEN") if gh_token is not None: request.add_header("Authorization", f"token {gh_token}") - with urllib.request.urlopen(request) as _: + with urllib.request.urlopen(request) as _: # type: ignore return True - except urllib.error.HTTPError: + except urllib.error.HTTPError: # type: ignore return False @@ -260,7 +259,7 @@ def to_json_path(root_dir: Path, path: Path, repo) -> Path: return _from_lean_path(root_dir, path, repo, ext=".ast.json") -def to_lean_path(root_dir: Path, path: Path, repo) -> bool: +def to_lean_path(root_dir: Path, path: Path) -> Path: if path.is_absolute(): path = path.relative_to(root_dir)