Commit

Done with bare min - Gemini working, pushing results
dhruvahuja19 committed Dec 19, 2024
1 parent 093664a commit ce62584
Showing 14 changed files with 2,029 additions and 1,481 deletions.
122 changes: 122 additions & 0 deletions analyze_patterns.py
@@ -0,0 +1,122 @@
import json
from pathlib import Path
from collections import defaultdict

# Load results
with open('results/results.json') as f:
    results = json.load(f)

# Overall metrics
total_tasks = len(results)
successes = [r for r in results if r.get('final_score', 0) == 1]
failures = [r for r in results if r.get('final_score', 0) != 1]
success_rate = (len(successes) / total_tasks) * 100 if total_tasks > 0 else 0

print("\nOverall Metrics:")
print("-" * 80)
print(f"Total Tasks: {total_tasks}")
print(f"Successful Tasks: {len(successes)}")
print(f"Failed Tasks: {len(failures)}")
print(f"Success Rate: {success_rate:.2f}%")

print("\nSuccessful Tasks:")
print("-" * 80)
for task in successes:
    print(f"ID: {task['task_id']}")
    print(f"Task: {task.get('task', '')}")
    print(f"Website: {task.get('web', '')}")
    if task.get('input_text'):
        print(f"Input: {task.get('input_text', '')}")
    if task.get('target_element'):
        print(f"Target: {task['target_element'].get('type', '')}={task['target_element'].get('value', '')}")
    print()

# Analyze element overlaps
success_elements = defaultdict(list)
failure_elements = defaultdict(list)

for task in successes:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        success_elements[element_key].append(task['task_id'])

for task in failures:
    if 'target_element' in task:
        element_key = (
            task['target_element'].get('type', ''),
            task['target_element'].get('value', '')
        )
        failure_elements[element_key].append(task['task_id'])

# Find overlapping elements
overlapping_elements = set(success_elements.keys()) & set(failure_elements.keys())

if overlapping_elements:
    print("\nElements that appear in both successes and failures:")
    print("-" * 80)
    for element in sorted(overlapping_elements):
        element_type, element_value = element
        print(f"\nElement: {element_type}={element_value}")
        print("\nSuccessful tasks:")
        for task_id in success_elements[element]:
            task = next(t for t in successes if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("\nFailed tasks:")
        for task_id in failure_elements[element]:
            task = next(t for t in failures if t['task_id'] == task_id)
            print(f"- {task_id}: {task.get('task', '')}")
        print("-" * 40)
else:
    print("\nNo elements appear in both successes and failures.")

# Group tasks by website
website_tasks = defaultdict(lambda: {'success': [], 'fail': []})

for task in results:
    website = task.get('web', '')
    if not website:
        continue

    if task.get('final_score', 0) == 1:
        website_tasks[website]['success'].append(task)
    else:
        website_tasks[website]['fail'].append(task)

# Find websites with both successes and failures
mixed_websites = {
    website: data
    for website, data in website_tasks.items()
    if data['success'] and data['fail']
}

if mixed_websites:
    print("\nWebsites with both successful and failed tasks:")
    print("-" * 80)

    for website, data in sorted(mixed_websites.items()):
        success_count = len(data['success'])
        fail_count = len(data['fail'])
        total = success_count + fail_count
        success_rate = (success_count / total) * 100

        print(f"\nWebsite: {website}")
        print(f"Success Rate: {success_rate:.2f}% ({success_count}/{total} tasks)")

        print("\nSuccessful Tasks:")
        for task in sorted(data['success'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✓ {task_desc}")

        print("\nFailed Tasks:")
        for task in sorted(data['fail'], key=lambda x: x.get('task', '')):
            task_desc = task.get('task', '').strip()
            if task_desc:
                print(f"✗ {task_desc}")

        print("-" * 80)
else:
    print("\nNo websites have both successes and failures - each website either consistently succeeds or fails.")
17 changes: 13 additions & 4 deletions analyze_results.py
@@ -6,12 +6,21 @@
 with open(results_file) as f:
     results = json.load(f)

-# Calculate succexss percentage
+# Calculate success percentage
 total_tasks = len(results)
-successful_tasks = sum(1 for result in results if result.get('final_score') == 0.2)
-success_percentage = (successful_tasks / total_tasks) * 100 if total_tasks > 0 else 0
+successful_tasks = [result for result in results if result.get('final_score', 0) == 1]
+success_percentage = (len(successful_tasks) / total_tasks) * 100 if total_tasks > 0 else 0

 print(f"\nResults Analysis:")
 print(f"Total Tasks: {total_tasks}")
-print(f"Successful Tasks: {successful_tasks}")
+print(f"Successful Tasks: {len(successful_tasks)}")
 print(f"Success Rate: {success_percentage:.2f}%")
+
+print("\nPassed Tests:")
+print("-" * 80)
+for task in successful_tasks:
+    print(f"Task ID: {task['task_id']}")
+    print(f"Website: {task.get('web_name', 'N/A')}")
+    print(f"Task: {task.get('task_description', 'N/A')}")
+    print(f"Score: {task.get('final_score', 0)}")
+    print("-" * 80)
88 changes: 88 additions & 0 deletions evaluate.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python3

import os
import json
import logging
import argparse
from pathlib import Path
from typing import Dict, Any, List

from evaluation.auto_eval import run_evaluation

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def load_results(results_file: Path) -> List[Dict[str, Any]]:
    """Load results from a JSON file."""
    if not results_file.exists():
        raise FileNotFoundError(f"Results file not found: {results_file}")

    with open(results_file, 'r') as f:
        return json.load(f)

def save_results(results: List[Dict[str, Any]], output_file: Path):
    """Save results to a JSON file."""
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

def main():
    parser = argparse.ArgumentParser(description='Evaluate DOM benchmark results')
    parser.add_argument('--tasks', required=True, help='Path to tasks JSONL file')
    parser.add_argument('--results', required=True, help='Path to results JSON file')
    parser.add_argument('--output', help='Path to output JSON file (default: results_with_eval.json)')
    parser.add_argument('--mode', choices=['serial', 'parallel'], default='serial', help='Evaluation mode')
    parser.add_argument('--max-workers', type=int, default=4, help='Max workers for parallel evaluation')
    args = parser.parse_args()

    # Set up paths
    tasks_file = Path(args.tasks)
    results_file = Path(args.results)
    output_file = Path(args.output) if args.output else results_file.parent / 'results_with_eval.json'

    # Load existing results
    results = load_results(results_file)
    logging.info(f"Loaded {len(results)} results from {results_file}")

    # Get OpenAI API key
    openai_key = os.getenv('OPENAI_API_KEY')
    if not openai_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    try:
        # Run evaluations
        eval_results = run_evaluation(
            tasks_file=tasks_file,
            results_dir=results_file,
            output_file=None, # Don't save intermediate results
            openai_key=openai_key,
            max_workers=args.max_workers if args.mode == 'parallel' else None
        )

        # Update results with evaluations
        for result in results:
            task_id = result['task_id']
            eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None)
            if eval_result:
                # Get evaluation scores and explanations
                result['visual_score'] = eval_result.get('visual_score', 0.0)
                result['html_score'] = eval_result.get('html_score', 0.0)
                result['visual_explanation'] = eval_result.get('visual_explanation', '')
                result['html_explanation'] = eval_result.get('html_explanation', '')
                result['total_score'] = (result['visual_score'] + result['html_score']) / 2.0

        # Save updated results
        save_results(results, output_file)
        logging.info(f"Saved evaluated results to {output_file}")

        # Print summary
        total_score = sum(r.get('total_score', 0.0) for r in results) / len(results)
        logging.info(f"Average score across {len(results)} tasks: {total_score:.2f}")

    except Exception as e:
        logging.error(f"Evaluation failed: {str(e)}")
        raise

if __name__ == '__main__':
    main()
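
For reference, after the merge loop in main() each evaluated record carries the two judge scores plus their average. A sketch of one such record, with made-up values:

evaluated_record = {
    "task_id": "example-001",
    "visual_score": 1.0,        # from the screenshot comparison
    "html_score": 0.0,          # from the fuzzy HTML match
    "visual_explanation": "Screenshot matches the ground truth.",
    "html_explanation": "Returned element differs from the expected HTML.",
    "total_score": (1.0 + 0.0) / 2.0,  # averaged exactly as in the script, i.e. 0.5
}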
49 changes: 40 additions & 9 deletions evaluation/auto_eval.py
@@ -1,13 +1,44 @@
 import logging
 import json
 from pathlib import Path
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Tuple
 from openai import OpenAI
+import time

 from evaluation.image_match import compare_images
 from evaluation.fuzzy_match import fuzzy_match_html
 from evaluation.parallel_eval import run_parallel_evaluation

+def retry_api_call(func, max_retries=3, initial_wait=1):
+    """Retry API calls with exponential backoff"""
+    def wrapper(*args, **kwargs):
+        retries = 0
+        while retries < max_retries:
+            try:
+                return func(*args, **kwargs)
+            except Exception as e:
+                retries += 1
+                if retries == max_retries:
+                    raise e
+                wait_time = initial_wait * (2 ** (retries - 1))
+                logging.warning(f"API call failed, retrying in {wait_time}s. Error: {str(e)}")
+                time.sleep(wait_time)
+    return wrapper
+
+@retry_api_call
+def evaluate_visual(client: OpenAI, prompt: str, ground_truth_path: str, agent_image_path: str) -> Tuple[bool, str]:
+    return compare_images(prompt=prompt,
+                          ground_truth_path=ground_truth_path,
+                          agent_image_path=agent_image_path,
+                          openai_client=client)
+
+@retry_api_call
+def evaluate_html(client: OpenAI, task_description: str, actual_html: str, expected_html: str) -> Tuple[bool, str]:
+    return fuzzy_match_html(task_description=task_description,
+                            actual_html=actual_html,
+                            expected_html=expected_html,
+                            openai_client=client)
+
 def run_serial_evaluation(
     tasks_file: Path,
     results_dir: Path,
@@ -33,20 +64,20 @@ def run_serial_evaluation(
         result = next((r for r in results if r.get('task_id') == task_id), None)
         if result:
             try:
-                # Visual evaluation using compare_images
-                visual_correctness, visual_reasoning = compare_images(
+                # Visual evaluation using compare_images with retry
+                visual_correctness, visual_reasoning = evaluate_visual(
+                    client,
                     prompt=f"Task: {task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
                     ground_truth_path=task['ground_truth']['screenshot'],
-                    agent_image_path=result["after_screenshot"],
-                    openai_client=client
+                    agent_image_path=result["after_screenshot"]
                 )

-                # HTML comparison using fuzzy_match
-                html_correctness, html_reasoning = fuzzy_match_html(
+                # HTML comparison using fuzzy_match with retry
+                html_correctness, html_reasoning = evaluate_html(
+                    client,
                     task_description=f"{task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
                     actual_html=result.get("html_element", ""),
-                    expected_html=task.get('target_html', ''),
-                    openai_client=client
+                    expected_html=task.get('target_html', '')
                 )

                 # Convert bool to float for scoring
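
A side note on the retry_api_call decorator added above: because it returns a bare wrapper, the decorated evaluate_visual/evaluate_html lose their __name__ and docstring in logs and tracebacks. If that matters, a minimal variant using functools.wraps (a sketch, not part of this commit) would preserve them:

import functools
import logging
import time

def retry_api_call(func, max_retries=3, initial_wait=1):
    """Retry API calls with exponential backoff, keeping the wrapped function's metadata."""
    @functools.wraps(func)  # preserve __name__ / __doc__ of the decorated function
    def wrapper(*args, **kwargs):
        for attempt in range(1, max_retries + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                if attempt == max_retries:
                    raise
                wait_time = initial_wait * (2 ** (attempt - 1))
                logging.warning(f"{func.__name__} failed, retrying in {wait_time}s. Error: {e}")
                time.sleep(wait_time)
    return wrapper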
2 changes: 1 addition & 1 deletion evaluation/fuzzy_match.py
@@ -18,7 +18,7 @@ def fuzzy_match_html(
     expected_html: str,
     note: str = None,
     openai_client: OpenAI = None
-) -> Tuple[bool, str]:
+) -> tuple[bool, str]:
     """Compare HTML elements using GPT-4 for semantic understanding"""

     if openai_client is None:
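
Worth noting: the switch from typing.Tuple[bool, str] to the built-in tuple[bool, str] only works as a subscripted annotation on Python 3.9+ (PEP 585). On 3.7/3.8 the usual workaround is postponed annotation evaluation, roughly as below (a sketch with a trimmed-down signature, valid as long as nothing evaluates the annotations at runtime):

from __future__ import annotations  # PEP 563: annotations stay as strings

def fuzzy_match_html(actual_html: str, expected_html: str) -> tuple[bool, str]:
    ...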