Done with bare min - Gemini working, pushing results
1 parent: 093664a
commit: ce62584
Showing 14 changed files with 2,029 additions and 1,481 deletions.
@@ -0,0 +1,122 @@
import json
from collections import defaultdict

# Load results
with open('results/results.json') as f:
    results = json.load(f)

# Overall metrics
total_tasks = len(results)
successes = [r for r in results if r.get('final_score', 0) == 1]
failures = [r for r in results if r.get('final_score', 0) != 1]
success_rate = (len(successes) / total_tasks) * 100 if total_tasks > 0 else 0

print("\nOverall Metrics:")
print("-" * 80)
print(f"Total Tasks: {total_tasks}")
print(f"Successful Tasks: {len(successes)}")
print(f"Failed Tasks: {len(failures)}")
print(f"Success Rate: {success_rate:.2f}%")

print("\nSuccessful Tasks:")
print("-" * 80)
for task in successes:
    print(f"ID: {task['task_id']}")
    print(f"Task: {task.get('task', '')}")
    print(f"Website: {task.get('web', '')}")
    if task.get('input_text'):
        print(f"Input: {task.get('input_text', '')}")
    if task.get('target_element'):
        print(f"Target: {task['target_element'].get('type', '')}={task['target_element'].get('value', '')}")
    print()

# Analyze element overlaps
success_elements = defaultdict(list)
failure_elements = defaultdict(list)

for task in successes:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        success_elements[element_key].append(task['task_id'])

for task in failures:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        failure_elements[element_key].append(task['task_id'])

# Find overlapping elements
overlapping_elements = set(success_elements.keys()) & set(failure_elements.keys())

if overlapping_elements:
    print("\nElements that appear in both successes and failures:")
    print("-" * 80)
    for element in sorted(overlapping_elements):
        element_type, element_value = element
        print(f"\nElement: {element_type}={element_value}")
        print("\nSuccessful tasks:")
        for task_id in success_elements[element]:
            task = next(t for t in successes if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("\nFailed tasks:")
        for task_id in failure_elements[element]:
            task = next(t for t in failures if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("-" * 40)
else:
    print("\nNo elements appear in both successes and failures.")

# Group tasks by website
website_tasks = defaultdict(lambda: {'success': [], 'fail': []})

for task in results:
    website = task.get('web', '')
    if not website:
        continue

    if task.get('final_score', 0) == 1:
        website_tasks[website]['success'].append(task)
    else:
        website_tasks[website]['fail'].append(task)

# Find websites with both successes and failures
mixed_websites = {
    website: data
    for website, data in website_tasks.items()
    if data['success'] and data['fail']
}

if mixed_websites:
    print("\nWebsites with both successful and failed tasks:")
    print("-" * 80)

    for website, data in sorted(mixed_websites.items()):
        success_count = len(data['success'])
        fail_count = len(data['fail'])
        total = success_count + fail_count
        success_rate = (success_count / total) * 100

        print(f"\nWebsite: {website}")
        print(f"Success Rate: {success_rate:.2f}% ({success_count}/{total} tasks)")

        print("\nSuccessful Tasks:")
        for task in sorted(data['success'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✓ {task_desc}")

        print("\nFailed Tasks:")
        for task in sorted(data['fail'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✗ {task_desc}")

        print("-" * 80)
else:
    print("\nNo websites have both successes and failures - each website either consistently succeeds or fails.")
@@ -0,0 +1,88 @@
#!/usr/bin/env python3

import os
import json
import logging
import argparse
from pathlib import Path
from typing import Dict, Any, List

from evaluation.auto_eval import run_evaluation

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def load_results(results_file: Path) -> List[Dict[str, Any]]:
    """Load results from a JSON file."""
    if not results_file.exists():
        raise FileNotFoundError(f"Results file not found: {results_file}")

    with open(results_file, 'r') as f:
        return json.load(f)

def save_results(results: List[Dict[str, Any]], output_file: Path):
    """Save results to a JSON file."""
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

def main():
    parser = argparse.ArgumentParser(description='Evaluate DOM benchmark results')
    parser.add_argument('--tasks', required=True, help='Path to tasks JSONL file')
    parser.add_argument('--results', required=True, help='Path to results JSON file')
    parser.add_argument('--output', help='Path to output JSON file (default: results_with_eval.json)')
    parser.add_argument('--mode', choices=['serial', 'parallel'], default='serial', help='Evaluation mode')
    parser.add_argument('--max-workers', type=int, default=4, help='Max workers for parallel evaluation')
    args = parser.parse_args()

    # Set up paths
    tasks_file = Path(args.tasks)
    results_file = Path(args.results)
    output_file = Path(args.output) if args.output else results_file.parent / 'results_with_eval.json'

    # Load existing results
    results = load_results(results_file)
    logging.info(f"Loaded {len(results)} results from {results_file}")

    # Get OpenAI API key
    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    try:
        # Run evaluations
        eval_results = run_evaluation(
            tasks_file=tasks_file,
            results_dir=results_file,
            output_file=None,  # Don't save intermediate results
            openai_key=openai_key,
            max_workers=args.max_workers if args.mode == 'parallel' else None
        )

        # Update results with evaluations
        for result in results:
            task_id = result['task_id']
            eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None)
            if eval_result:
                # Get evaluation scores and explanations
                result['visual_score'] = eval_result.get('visual_score', 0.0)
                result['html_score'] = eval_result.get('html_score', 0.0)
                result['visual_explanation'] = eval_result.get('visual_explanation', '')
                result['html_explanation'] = eval_result.get('html_explanation', '')
                result['total_score'] = (result['visual_score'] + result['html_score']) / 2.0

        # Save updated results
        save_results(results, output_file)
        logging.info(f"Saved evaluated results to {output_file}")

        # Print summary
        avg_score = sum(r.get('total_score', 0.0) for r in results) / len(results) if results else 0.0
        logging.info(f"Average score across {len(results)} tasks: {avg_score:.2f}")

    except Exception as e:
        logging.error(f"Evaluation failed: {e}")
        raise

if __name__ == '__main__':
    main()
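
The argparse flags above imply an invocation along the lines of OPENAI_API_KEY=... python evaluate_results.py --tasks tasks.jsonl --results results/results.json --mode parallel --max-workers 8, where the filename evaluate_results.py and the tasks path are assumptions, since neither appears in this hunk. The update loop also implies a return shape for run_evaluation; the sketch below is reconstructed purely from how the loop indexes eval_results, not from evaluation.auto_eval itself:

# Assumed structure of run_evaluation's return value, inferred only from
# the update loop above (evaluation.auto_eval is not shown in this diff).
eval_results = {
    "evaluations": [
        {
            "task_id": "task_001",    # matched against result['task_id']
            "visual_score": 1.0,      # screenshot-based judgment
            "html_score": 0.5,        # DOM/HTML-based judgment
            "visual_explanation": "...",
            "html_explanation": "...",
        },
    ],
}
# Each result's total_score is then the mean of the two scores:
# (1.0 + 0.5) / 2.0 == 0.75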