Commit

Done with bare min - Gemini working, pushing results
dhruvahuja19 committed Dec 19, 2024
1 parent 093664a commit ce62584
Showing 14 changed files with 2,029 additions and 1,481 deletions.
122 changes: 122 additions & 0 deletions analyze_patterns.py
@@ -0,0 +1,122 @@
import json
from pathlib import Path
from collections import defaultdict

# Load results
with open('results/results.json') as f:
    results = json.load(f)

# Overall metrics
total_tasks = len(results)
successes = [r for r in results if r.get('final_score', 0) == 1]
failures = [r for r in results if r.get('final_score', 0) != 1]
success_rate = (len(successes) / total_tasks) * 100 if total_tasks > 0 else 0

print("\nOverall Metrics:")
print("-" * 80)
print(f"Total Tasks: {total_tasks}")
print(f"Successful Tasks: {len(successes)}")
print(f"Failed Tasks: {len(failures)}")
print(f"Success Rate: {success_rate:.2f}%")

print("\nSuccessful Tasks:")
print("-" * 80)
for task in successes:
    print(f"ID: {task['task_id']}")
    print(f"Task: {task.get('task', '')}")
    print(f"Website: {task.get('web', '')}")
    if task.get('input_text'):
        print(f"Input: {task.get('input_text', '')}")
    if task.get('target_element'):
        print(f"Target: {task['target_element'].get('type', '')}={task['target_element'].get('value', '')}")
    print()

# Analyze element overlaps
success_elements = defaultdict(list)
failure_elements = defaultdict(list)

for task in successes:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        success_elements[element_key].append(task['task_id'])

for task in failures:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        failure_elements[element_key].append(task['task_id'])

# Find overlapping elements
overlapping_elements = set(success_elements.keys()) & set(failure_elements.keys())

if overlapping_elements:
    print("\nElements that appear in both successes and failures:")
    print("-" * 80)
    for element in sorted(overlapping_elements):
        element_type, element_value = element
        print(f"\nElement: {element_type}={element_value}")
        print("\nSuccessful tasks:")
        for task_id in success_elements[element]:
            task = next(t for t in successes if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("\nFailed tasks:")
        for task_id in failure_elements[element]:
            task = next(t for t in failures if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("-" * 40)
else:
    print("\nNo elements appear in both successes and failures.")

# Group tasks by website
website_tasks = defaultdict(lambda: {'success': [], 'fail': []})

for task in results:
    website = task.get('web', '')
    if not website:
        continue

    if task.get('final_score', 0) == 1:
        website_tasks[website]['success'].append(task)
    else:
        website_tasks[website]['fail'].append(task)

# Find websites with both successes and failures
mixed_websites = {
    website: data
    for website, data in website_tasks.items()
    if data['success'] and data['fail']
}

if mixed_websites:
    print("\nWebsites with both successful and failed tasks:")
    print("-" * 80)

    for website, data in sorted(mixed_websites.items()):
        success_count = len(data['success'])
        fail_count = len(data['fail'])
        total = success_count + fail_count
        success_rate = (success_count / total) * 100

        print(f"\nWebsite: {website}")
        print(f"Success Rate: {success_rate:.2f}% ({success_count}/{total} tasks)")

        print("\nSuccessful Tasks:")
        for task in sorted(data['success'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✓ {task_desc}")

        print("\nFailed Tasks:")
        for task in sorted(data['fail'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✗ {task_desc}")

        print("-" * 80)
else:
    print("\nNo websites have both successes and failures - each website either consistently succeeds or fails.")
17 changes: 13 additions & 4 deletions analyze_results.py
@@ -6,12 +6,21 @@
 with open(results_file) as f:
     results = json.load(f)

-# Calculate succexss percentage
+# Calculate success percentage
 total_tasks = len(results)
-successful_tasks = sum(1 for result in results if result.get('final_score') == 0.2)
-success_percentage = (successful_tasks / total_tasks) * 100 if total_tasks > 0 else 0
+successful_tasks = [result for result in results if result.get('final_score', 0) == 1]
+success_percentage = (len(successful_tasks) / total_tasks) * 100 if total_tasks > 0 else 0

 print(f"\nResults Analysis:")
 print(f"Total Tasks: {total_tasks}")
-print(f"Successful Tasks: {successful_tasks}")
+print(f"Successful Tasks: {len(successful_tasks)}")
 print(f"Success Rate: {success_percentage:.2f}%")
+
+print("\nPassed Tests:")
+print("-" * 80)
+for task in successful_tasks:
+    print(f"Task ID: {task['task_id']}")
+    print(f"Website: {task.get('web_name', 'N/A')}")
+    print(f"Task: {task.get('task_description', 'N/A')}")
+    print(f"Score: {task.get('final_score', 0)}")
+    print("-" * 80)
88 changes: 88 additions & 0 deletions evaluate.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python3

import os
import json
import logging
import argparse
from pathlib import Path
from typing import Dict, Any, List

from evaluation.auto_eval import run_evaluation

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def load_results(results_file: Path) -> List[Dict[str, Any]]:
    """Load results from a JSON file."""
    if not results_file.exists():
        raise FileNotFoundError(f"Results file not found: {results_file}")

    with open(results_file, 'r') as f:
        return json.load(f)

def save_results(results: List[Dict[str, Any]], output_file: Path):
    """Save results to a JSON file."""
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

def main():
    parser = argparse.ArgumentParser(description='Evaluate DOM benchmark results')
    parser.add_argument('--tasks', required=True, help='Path to tasks JSONL file')
    parser.add_argument('--results', required=True, help='Path to results JSON file')
    parser.add_argument('--output', help='Path to output JSON file (default: results_with_eval.json)')
    parser.add_argument('--mode', choices=['serial', 'parallel'], default='serial', help='Evaluation mode')
    parser.add_argument('--max-workers', type=int, default=4, help='Max workers for parallel evaluation')
    args = parser.parse_args()

    # Set up paths
    tasks_file = Path(args.tasks)
    results_file = Path(args.results)
    output_file = Path(args.output) if args.output else results_file.parent / 'results_with_eval.json'

    # Load existing results
    results = load_results(results_file)
    logging.info(f"Loaded {len(results)} results from {results_file}")

    # Get OpenAI API key
    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    try:
        # Run evaluations
        eval_results = run_evaluation(
            tasks_file=tasks_file,
            results_dir=results_file,
            output_file=None, # Don't save intermediate results
            openai_key=openai_key,
            max_workers=args.max_workers if args.mode == 'parallel' else None
        )

        # Update results with evaluations
        for result in results:
            task_id = result['task_id']
            eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None)
            if eval_result:
                # Get evaluation scores and explanations
                result['visual_score'] = eval_result.get('visual_score', 0.0)
                result['html_score'] = eval_result.get('html_score', 0.0)
                result['visual_explanation'] = eval_result.get('visual_explanation', '')
                result['html_explanation'] = eval_result.get('html_explanation', '')
                result['total_score'] = (result['visual_score'] + result['html_score']) / 2.0

        # Save updated results
        save_results(results, output_file)
        logging.info(f"Saved evaluated results to {output_file}")

        # Print summary
        total_score = sum(r.get('total_score', 0.0) for r in results) / len(results)
        logging.info(f"Average score across {len(results)} tasks: {total_score:.2f}")

    except Exception as e:
        logging.error(f"Evaluation failed: {str(e)}")
        raise

if __name__ == '__main__':
    main()
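
For reference, after the merge loop in main() each evaluated record carries the two judge scores plus their average. A sketch of one such record, with made-up values:

evaluated_record = {
    "task_id": "example-001",
    "visual_score": 1.0,        # from the screenshot comparison
    "html_score": 0.0,          # from the fuzzy HTML match
    "visual_explanation": "Screenshot matches the ground truth.",
    "html_explanation": "Returned element differs from the expected HTML.",
    "total_score": (1.0 + 0.0) / 2.0,  # averaged exactly as in the script, i.e. 0.5
}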
49 changes: 40 additions & 9 deletions evaluation/auto_eval.py
@@ -1,13 +1,44 @@
 import logging
 import json
 from pathlib import Path
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Tuple
 from openai import OpenAI
+import time

 from evaluation.image_match import compare_images
 from evaluation.fuzzy_match import fuzzy_match_html
 from evaluation.parallel_eval import run_parallel_evaluation

+def retry_api_call(func, max_retries=3, initial_wait=1):
+    """Retry API calls with exponential backoff"""
+    def wrapper(*args, **kwargs):
+        retries = 0
+        while retries < max_retries:
+            try:
+                return func(*args, **kwargs)
+            except Exception as e:
+                retries += 1
+                if retries == max_retries:
+                    raise e
+                wait_time = initial_wait * (2 ** (retries - 1))
+                logging.warning(f"API call failed, retrying in {wait_time}s. Error: {str(e)}")
+                time.sleep(wait_time)
+    return wrapper
+
+@retry_api_call
+def evaluate_visual(client: OpenAI, prompt: str, ground_truth_path: str, agent_image_path: str) -> Tuple[bool, str]:
+    return compare_images(prompt=prompt,
+                          ground_truth_path=ground_truth_path,
+                          agent_image_path=agent_image_path,
+                          openai_client=client)
+
+@retry_api_call
+def evaluate_html(client: OpenAI, task_description: str, actual_html: str, expected_html: str) -> Tuple[bool, str]:
+    return fuzzy_match_html(task_description=task_description,
+                            actual_html=actual_html,
+                            expected_html=expected_html,
+                            openai_client=client)
+
 def run_serial_evaluation(
     tasks_file: Path,
     results_dir: Path,
@@ -33,20 +64,20 @@ def run_serial_evaluation(
         result = next((r for r in results if r.get('task_id') == task_id), None)
         if result:
             try:
-                # Visual evaluation using compare_images
-                visual_correctness, visual_reasoning = compare_images(
+                # Visual evaluation using compare_images with retry
+                visual_correctness, visual_reasoning = evaluate_visual(
+                    client,
                     prompt=f"Task: {task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
                     ground_truth_path=task['ground_truth']['screenshot'],
-                    agent_image_path=result["after_screenshot"],
-                    openai_client=client
+                    agent_image_path=result["after_screenshot"]
                 )

-                # HTML comparison using fuzzy_match
-                html_correctness, html_reasoning = fuzzy_match_html(
+                # HTML comparison using fuzzy_match with retry
+                html_correctness, html_reasoning = evaluate_html(
+                    client,
                     task_description=f"{task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
                     actual_html=result.get("html_element", ""),
-                    expected_html=task.get('target_html', ''),
-                    openai_client=client
+                    expected_html=task.get('target_html', '')
                 )

                 # Convert bool to float for scoring
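
A side note on the retry_api_call decorator added above: because it returns a bare wrapper, the decorated evaluate_visual/evaluate_html lose their __name__ and docstring in logs and tracebacks. If that matters, a minimal variant using functools.wraps (a sketch, not part of this commit) would preserve them:

import functools
import logging
import time

def retry_api_call(func, max_retries=3, initial_wait=1):
    """Retry API calls with exponential backoff, keeping the wrapped function's metadata."""
    @functools.wraps(func)  # preserve __name__ / __doc__ of the decorated function
    def wrapper(*args, **kwargs):
        for attempt in range(1, max_retries + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                if attempt == max_retries:
                    raise
                wait_time = initial_wait * (2 ** (attempt - 1))
                logging.warning(f"{func.__name__} failed, retrying in {wait_time}s. Error: {e}")
                time.sleep(wait_time)
    return wrapper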
2 changes: 1 addition & 1 deletion evaluation/fuzzy_match.py
@@ -18,7 +18,7 @@ def fuzzy_match_html(
     expected_html: str,
     note: str = None,
     openai_client: OpenAI = None
-) -> Tuple[bool, str]:
+) -> tuple[bool, str]:
     """Compare HTML elements using GPT-4 for semantic understanding"""

     if openai_client is None:
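
Worth noting: the switch from typing.Tuple[bool, str] to the built-in tuple[bool, str] only works as a subscripted annotation on Python 3.9+ (PEP 585). On 3.7/3.8 the usual workaround is postponed annotation evaluation, roughly as below (a sketch with a trimmed-down signature, valid as long as nothing evaluates the annotations at runtime):

from __future__ import annotations  # PEP 563: annotations stay as strings

def fuzzy_match_html(actual_html: str, expected_html: str) -> tuple[bool, str]:
    ...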