Merge pull request #101 from commit-0/analysis_in_docs
submissions analysis + webpage rendering
wenting-zhao authored Dec 7, 2024
2 parents 1b7780d + fc70756 commit df0dc34
Showing 6 changed files with 198 additions and 45 deletions.
24 changes: 24 additions & 0 deletions docs/README.md
@@ -0,0 +1,24 @@

Update the HF submissions dataset:
```
python docs/update_submissions_dataset.py
```

Run the submissions analysis on a given SPLIT:
```
# --do_setup, --get_blank_details, and --get_reference_details are only needed
# once, when first setting up the environment
python docs/render_submissions.py \
    --do_setup --get_blank_details --get_reference_details \
    --analyze_submissions \
    --split SPLIT
```

Render webpages for the submissions:
```
python docs/render_submissions.py --render_webpages --overwrite_previous_eval
```

Deploy to the website:
```
cd ../commit-0.github.io
mkdocs gh-deploy --config-file ../commit0/mkdocs.yml --remote-branch main
```
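
For convenience, the four steps above can also be chained from the repository root. A minimal sketch in Python (a `subprocess` wrapper around the exact commands listed above; the `SPLIT` value and running from the commit0 checkout next to `../commit-0.github.io` are assumptions carried over from the deploy step):
```python
import subprocess

SPLIT = "lite"  # or "all"

steps = [
    "python docs/update_submissions_dataset.py",
    # --do_setup --get_blank_details --get_reference_details are only needed
    # the first time the environment is set up (see above)
    f"python docs/render_submissions.py --analyze_submissions --split {SPLIT}",
    "python docs/render_submissions.py --render_webpages --overwrite_previous_eval",
]
for cmd in steps:
    subprocess.run(cmd, shell=True, check=True)

# Deploy from the website checkout, pointing back at this repo's mkdocs.yml.
subprocess.run(
    "mkdocs gh-deploy --config-file ../commit0/mkdocs.yml --remote-branch main",
    shell=True,
    check=True,
    cwd="../commit-0.github.io",
)
```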
24 changes: 19 additions & 5 deletions docs/javascripts/tablesort.js
@@ -1,6 +1,20 @@
document$.subscribe(function() {
  var tables = document.querySelectorAll("article table:not([class])")
  tables.forEach(function(table) {
    new Tablesort(table)
  })
})
  var tables = document.querySelectorAll("article table:not([class])")
  tables.forEach(function(table) {
    new Tablesort(table);
    // Automatically sort the table by the specified column
    var defaultSortColumn = 2; // Index of the column to sort (0-based)
    var isAscending = false; // Set to false for descending order

    // Delay to ensure Tablesort is fully initialized
    setTimeout(function () {
      var header = table.querySelectorAll("thead th")[defaultSortColumn];
      if (header) {
        header.click(); // Simulate a click on the header
        if (!isAscending) {
          header.click(); // Click again for descending order
        }
      }
    }, 100);
  });
});
26 changes: 26 additions & 0 deletions docs/javascripts/tablesort.number.js
@@ -0,0 +1,26 @@
(function(){
  var cleanNumber = function(i) {
    return i.replace(/[^\-?0-9.]/g, '');
  },

  compareNumber = function(a, b) {
    a = parseFloat(a);
    b = parseFloat(b);

    a = isNaN(a) ? 0 : a;
    b = isNaN(b) ? 0 : b;

    return a - b;
  };

  Tablesort.extend('number', function(item) {
    return item.match(/^[-+]?[£\x24Û¢´]?\d+\s*([,\.]\d{0,2})/) || // Prefixed currency
      item.match(/^[-+]?\d+\s*([,\.]\d{0,2})?[£\x24Û¢´]/) || // Suffixed currency
      item.match(/^[-+]?(\d)*-?([,\.]){0,1}-?(\d)+([E,e][\-+][\d]+)?%?$/); // Number
  }, function(a, b) {
    a = cleanNumber(a);
    b = cleanNumber(b);

    return compareNumber(b, a);
  });
}());
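
These regexes decide which cells the `number` extension treats as numeric, so the new percentage and duration columns sort numerically rather than lexically. A quick offline check of the same patterns, sketched in Python with `re` purely as an illustration (the sample cell values are hypothetical, not taken from any real leaderboard):
```python
import re

# Patterns copied from docs/javascripts/tablesort.number.js above.
NUMBER_PATTERNS = [
    r"^[-+]?[£\x24Û¢´]?\d+\s*([,\.]\d{0,2})",                   # prefixed currency
    r"^[-+]?\d+\s*([,\.]\d{0,2})?[£\x24Û¢´]",                    # suffixed currency
    r"^[-+]?(\d)*-?([,\.]){0,1}-?(\d)+([E,e][\-+][\d]+)?%?$",    # plain number / percentage
]

def sorts_as_number(cell: str) -> bool:
    """Return True if the tablesort 'number' extension would match this cell."""
    return any(re.match(pattern, cell) for pattern in NUMBER_PATTERNS)

for cell in ["85.17%", "123.45", "12", "-", "Failed."]:
    print(cell, sorts_as_number(cell))
# Percentages and plain numbers match; "-" and "Failed." fall back to text sorting.
```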
128 changes: 96 additions & 32 deletions docs/render_submissions.py
@@ -13,10 +13,12 @@
from transformers import AutoTokenizer

from commit0.harness.constants import SPLIT
from commit0.harness.get_pytest_ids import main as get_tests
from commit0.harness.utils import clone_repo
from commit0.cli import write_commit0_config_file

import logging
from typing import Any, NoReturn

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -26,9 +28,13 @@
analysis_files_path = "/share/rush/commit0_analysis_temp"


def get_pytest_info(path_to_logs, repo_name, branch_name):
def get_pytest_info(
path_to_logs: str, repo_name: str, branch_name: str
) -> dict[str, dict[str, Any]]:
pytest_info = {}
for pytest_hash in os.listdir(path_to_logs):
if not os.path.exists(os.path.join(path_to_logs, pytest_hash, "eval.sh")):
continue
eval_script = open(os.path.join(path_to_logs, pytest_hash, "eval.sh")).read()
testname = re.search(r"([\S]+) > test_output", eval_script).group(1)
patch_diff = open(os.path.join(path_to_logs, pytest_hash, "patch.diff")).read()
@@ -84,19 +90,19 @@ def get_pytest_info(path_to_logs, repo_name, branch_name):
"failure_string": failure_string,
"duration": duration,
}
return pytest_info
return pytest_info if len(pytest_info) else "Could not evaluate"


def get_coverage_info(path_to_logs, repo_name, branch_name):
def get_coverage_info(path_to_logs: str, repo_name: str, branch_name: str) -> Any:
raise NotImplementedError


def get_blank_repo_metrics(
blank_source_code_folder,
spec_filename,
blank_source_code_folder: str,
spec_filename: str,
tokenizer,
code_file_filter=lambda filename: filename,
):
) -> dict[str, Any]:
blank_repo_metrics = {
"functions_to_edit": [],
}
@@ -164,7 +170,7 @@ def get_blank_repo_metrics(


leaderboard_header = """\n\n## Leaderboard ({split})
| Name | Repos Resolved (/{num_repos}) | Total Tests Passed (/{total_num_tests}) | Test Duration (s) | Date | Analysis | Github |
| Name | Repos Resolved (/{num_repos}) | Avg. pass rate | Test Duration (s) | Date | Analysis | Github |
|------|:-------------------------:|:--------------------:|:--------------------:|:----------:|----|----| """

submission_table_header = """# Submission Name: **{display_name}** (split: {split})
@@ -178,33 +184,44 @@
"""


def render_mds(overwrite_previous, subfolder="docs"):
def render_mds(overwrite_previous: bool, subfolder: str = "docs") -> NoReturn:
leaderboard = {}

split_to_total_tests = {
"lite": 3628,
"all": 140926,
} # hard-coded to skip running it later
for split in tqdm.tqdm(["lite", "all"]):
for split in ["lite", "all"]:
num_repos = len(SPLIT[split])
# total_num_tests = 0
# for repo_name in SPLIT[split]:
# repo_tests = subprocess.run(['commit0', 'get-tests', repo_name], capture_output=True, text=True).stdout.strip()
# total_num_tests += len(repo_tests.splitlines())
leaderboard[split] = leaderboard_header.format(
split=split,
num_repos=num_repos,
total_num_tests=split_to_total_tests[split],
leaderboard[split] = []
leaderboard[split].append(
(
split_to_total_tests[split] + 1,
leaderboard_header.format(
split=split,
num_repos=num_repos,
total_num_tests=split_to_total_tests[split],
),
)
)

for org_path in tqdm.tqdm(glob.glob(os.path.join(analysis_files_path, "*"))):
org_name = os.path.basename(org_path)
if org_name in {"blank", "repos", "submission_repos"}:
continue
for branch_path in glob.glob(os.path.join(org_path, "*.json")):
cum_tests_passed = 0
evaluate_numbers = []
lite_evaluate_numbers = []
# cum_tests_passed = 0
repos_resolved = 0
total_duration = 0.0
# lite_cum_tests_passed = 0
lite_repos_resolved = 0
lite_total_duration = 0.0
branch_metrics = json.load(open(branch_path))
submission_info = branch_metrics["submission_info"]
split = submission_info["split"]
@@ -234,7 +251,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
)
if isinstance(repo_pytest_results, str):
submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed to clone\n\n{repo_pytest_results}"
submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed\n\n{repo_pytest_results}"
org_branch_repo_filepath = os.path.join(
subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
)
@@ -246,7 +263,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
submission_page = submission_table_header.format(
display_name=display_name, split=split
) + (
f"\n| {repo_name} | No; Failed to clone. | - | - | "
f"\n| {repo_name} | No; {repo_pytest_results} | - | - | "
f"[Analysis](/{f'analysis_{org_name}_{branch_name}_{repo_name}'}) | "
f"[Github]({github_hyperlink}) |"
)
@@ -267,13 +284,23 @@ def render_mds(overwrite_previous, subfolder="docs"):
)
pytest_details = "Pytest failed"
duration = "Failed."
evaluate_numbers.append(0.0)
if split == "all" and repo_name in SPLIT["lite"]:
lite_evaluate_numbers.append(0.0)
else:
resolved = False
if "passed" in pytest_info["summary"]:
if "skipped" in pytest_info["summary"]:
resolved = pytest_info["summary"]["passed"] + pytest_info["summary"]["skipped"] == pytest_info["summary"]["total"]
resolved = (
pytest_info["summary"]["passed"]
+ pytest_info["summary"]["skipped"]
== pytest_info["summary"]["total"]
)
else:
resolved = pytest_info["summary"]["passed"] == pytest_info["summary"]["total"]
resolved = (
pytest_info["summary"]["passed"]
== pytest_info["summary"]["total"]
)
if write_submission:
submission_repo_page += pytest_summary_table_header.format(
pytest_group=pytest_group
@@ -295,9 +322,21 @@ def render_mds(overwrite_previous, subfolder="docs"):
f"### {shortened_testname}\n\n<details><summary> <pre>{shortened_testname}"
f"</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
)
cum_tests_passed += pytest_info["summary"]["passed"]
# cum_tests_passed += pytest_info["summary"]["passed"]
num_tests = len(get_tests(repo_name, verbose=0))
evaluate_numbers.append(
pytest_info["summary"]["passed"] / num_tests
)
total_duration += pytest_info["duration"]
repos_resolved += int(resolved)
if split == "all" and repo_name in SPLIT["lite"]:
lite_evaluate_numbers.append(
pytest_info["summary"]["passed"] / num_tests
)
# lite_cum_tests_passed += pytest_info["summary"]["passed"]
lite_total_duration += pytest_info["duration"]
lite_repos_resolved += int(resolved)

if write_submission:
pytest_details = f"{pytest_info['summary']['passed']} / {pytest_info['summary']['total']}"
duration = f"{pytest_info['duration']:.2f}"
@@ -322,22 +361,46 @@ def render_mds(overwrite_previous, subfolder="docs"):
wf.write(back_button + "\n" + submission_page)
analysis_link = f"[Analysis](/{f'analysis_{org_name}_{branch_name}'})"
github_link = f"[Github]({project_page_link})"
leaderboard[split] += (
f"\n|{display_name}|"
f"{repos_resolved}|"
f"{cum_tests_passed}|"
f"{total_duration:.2f}|"
f"{submission_date}|"
f"{analysis_link}|"
f"{github_link}|"
avg_pass_rate = sum(evaluate_numbers) / len(evaluate_numbers)
leaderboard[split].append(
(
avg_pass_rate * 100,
f"\n|{display_name}|"
f"{repos_resolved}|"
f"{avg_pass_rate*100:.2f}%|"
f"{total_duration:.2f}|"
f"{submission_date}|"
f"{analysis_link}|"
f"{github_link}|",
)
)
if (split == "all") and ("Reference (Gold)" not in display_name):
avg_lite_pass_rate = sum(lite_evaluate_numbers) / len(
lite_evaluate_numbers
)
leaderboard["lite"].append(
(
avg_lite_pass_rate * 100,
f"\n|{display_name} (subset of `all`)|"
f"{lite_repos_resolved}|"
f"{avg_lite_pass_rate*100:.2f}%|"
f"{lite_total_duration:.2f}|"
f"{submission_date}|"
f"{analysis_link}|"
f"{github_link}|",
)
)

leaderboard_filepath = os.path.join(subfolder, "analysis.md")
for split in ["lite", "all"]:
leaderboard[split] = sorted(leaderboard[split], key=lambda elt: -elt[0])
with open(leaderboard_filepath, "w") as wf:
wf.write(leaderboard["lite"] + "\n\n" + leaderboard["all"])
lite_leaderboard_string = "".join(string for (_, string) in leaderboard["lite"])
all_leaderboard_string = "".join(string for (_, string) in leaderboard["all"])
wf.write(lite_leaderboard_string + "\n\n" + all_leaderboard_string)


def get_args():
def get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument(
"--do_setup", action="store_true", help="Run commit0 setup with specified split"
@@ -366,14 +429,14 @@ def get_args():
parser.add_argument(
"--overwrite_previous_eval",
action="store_true",
help="Overwrite cached pytest info"
help="Overwrite cached pytest info",
# TODO add finer granularity so can specify which ones to overwrite
)

return parser.parse_args()


def main(args):
def main(args: argparse.Namespace) -> NoReturn:
global analysis_files_path

commit0_dataset_name = "wentingzhao/commit0_combined"
@@ -493,6 +556,7 @@ def main(args):
)
if os.path.exists(submission_repos_path):
shutil.rmtree(submission_repos_path)
print(f"Removed existing at {submission_repos_path}")
os.makedirs(os.path.join(analysis_files_path, org_name), exist_ok=True)
commit0_config_file = os.path.join(
analysis_files_path,
@@ -530,7 +594,7 @@ def main(args):
)
# run pytests
os.system(
f"commit0 evaluate --branch {branch_name} "
f"commit0 evaluate --branch {branch_name} --timeout 1800"
f"--commit0-config-file {commit0_config_file}"
)
for example in dataset:
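
The leaderboard rows are now kept as (sort key, markdown row) pairs, and a macro-averaged per-repo pass rate replaces the old cumulative "total tests passed" column. A small self-contained illustration of that bookkeeping (the numbers and the submission name are made up; the real values come from the per-repo pytest summaries):
```python
# Hypothetical passed/total ratios for one submission, one entry per repo.
evaluate_numbers = [1.0, 0.62, 0.0, 0.85]
avg_pass_rate = sum(evaluate_numbers) / len(evaluate_numbers)

leaderboard = [
    # The header's key (split_to_total_tests[split] + 1 in the script) exceeds
    # any percentage, so it always sorts to the top.
    (3628 + 1, "\n## Leaderboard (lite)\n| Name | Repos Resolved | Avg. pass rate | ... |"),
    (avg_pass_rate * 100, f"\n|hypothetical-submission|2|{avg_pass_rate * 100:.2f}%|...|"),
]
leaderboard.sort(key=lambda elt: -elt[0])  # descending by average pass rate
print("".join(row for _, row in leaderboard))
```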
38 changes: 31 additions & 7 deletions docs/update_submissions_dataset.py
@@ -1,12 +1,36 @@
from datasets import Dataset

submissions = {
"org_name": ["test-save-commit0", "commit0-lite-with-test", "commit0-lite-plain", "commit0-all-plain"],
"branch": ["baseline", "fillin", "fillin", "fillin"],
"display_name": ["Claude Sonnet 3.5 - Base", "Claude Sonnet 3.5 - Fill-in + Unit Test Feedback", "Claude Sonnet 3.5 - Fill-in", "Claude Sonnet 3.5 - Fill-in"],
"submission_date": ["09/25/2024", "09/25/2024", "09/25/2024", "09/25/2024"],
"split": ["lite", "lite", "lite", "all"],
"project_page": ["https://github.com/test-save-commit0", "https://github.com/commit0-lite-with-test", "https://github.com/commit0-lite-plain", "https://github.com/commit0-all-plain"]
"org_name": [
"test-save-commit0",
"commit0-fillin",
"commit0-lite-test",
"openhands-commit0",
"sweagent-commit0",
],
"branch": ["baseline", "sonnet", "sonnet", "openhands", "sweagent"],
"display_name": [
"Claude Sonnet 3.5 - Base",
"Claude Sonnet 3.5 - Fill-in",
"Claude Sonnet 3.5 - Fill-in + Lint & Unit Test Feedback",
"OpenHands",
"SWE-Agent",
],
"submission_date": [
"09/25/2024",
"09/25/2024",
"09/25/2024",
"11/25/2024",
"11/26/2024",
],
"split": ["lite", "all", "lite", "all", "lite"],
"project_page": [
"https://github.com/test-save-commit0",
"https://github.com/commit0-fillin",
"https://github.com/commit0-lite-test",
"https://github.com/openhands-commit0",
"https://github.com/sweagent-commit0",
],
}

Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
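
The pushed dataset can be read back with `datasets.load_dataset`, which is presumably how `render_submissions.py` discovers the org/branch pairs to analyze. A quick sketch, assuming the default `train` split that `push_to_hub` creates:
```python
from datasets import load_dataset

# Read back the submission metadata pushed above (default "train" split assumed).
submissions = load_dataset("celinelee/commit0_submissions", split="train")
for row in submissions:
    print(row["org_name"], row["branch"], row["split"], row["display_name"])
```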