diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a86798c --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Key for GPT-4 model +OPENAI_API_KEY=your-openai-api-key-here + +# Anthropic API Key for Claude model +ANTHROPIC_API_KEY=your-anthropic-api-key-here + +# Optional: Model configurations +GPT4_MODEL=gpt-4-turbo-preview # or gpt-4 +CLAUDE_MODEL=claude-3-opus-20240229 + +# Optional: Execution settings +MAX_WORKERS=4 +TIMEOUT_SECONDS=30 diff --git a/README.md b/README.md index 9f1dbd0..baab19e 100644 --- a/README.md +++ b/README.md @@ -10,175 +10,125 @@ DOM and DOMer-2 focuses on testing a model's ability to interact with web elemen 2. Real websites with diverse DOM structures 3. Ground truth screenshots for validation 4. GPT-4V based evaluation +5. Support for both serial and parallel execution -## Directory Structure - -``` -DOMe-and-DOMer-2/ -├── data/ -│ ├── dom_tasks.jsonl # Task definitions -│ └── ground_truth/ # Ground truth screenshots -│ ├── amazon_search_1_gt.png -│ └── ... -├── evaluation/ -│ ├── auto_eval.py # GPT-4V evaluation script -│ └── README.md # Evaluation documentation -├── results/ # Results for each run -│ └── run_001/ -│ ├── before_*.png # Screenshots before interaction -│ ├── after_*.png # Screenshots after interaction -│ ├── accessibility_*.json # Accessibility trees -│ ├── results.json # Raw results -│ ├── evaluation.json # GPT-4V evaluations -│ └── benchmark.log # Detailed logs -├── prompts.py # LLM system prompts -├── run.py # Main benchmark runner -├── utils.py # Utility functions -└── requirements.txt # Dependencies - -## Task Format - -Tasks are defined in `data/dom_tasks.jsonl`: - -```json -{ - "web_name": "Cambridge Dictionary", - "id": "cambridge_lookup_1", - "task": "Click the search box and type 'hello'", - "web": "https://dictionary.cambridge.org/", - "element_type": "input", - "interaction": "type", - "target_element": { - "type": "id", - "value": "searchword" - }, - "input_text": "hello", - "target_html": "", - "ground_truth": { - "screenshot": "evaluation/ground_truth/task_1_gt.png", - "description": "The word 'hello' has been entered in the search box" - } -} -``` - -Key fields: -- `target_element`: Selector information for finding the element -- `target_html`: Expected HTML structure of the element -- `ground_truth`: Reference screenshot and description - -## Ground Truth - -Ground truth is provided in two forms: -1. **Screenshots**: Visual state after successful interaction -2. **Descriptions**: Text description of expected changes - -Located in `data/ground_truth/`, each task has: -- `[task_id]_gt.png`: Screenshot of successful interaction -- Description in task JSON explaining expected changes - -## Environment Setup +## Installation -1. Create a virtual environment and install dependencies: +1. Clone the repository: ```bash -python -m venv venv -source venv/bin/activate # On Windows: venv\Scripts\activate -pip install -r requirements.txt +git clone https://github.com/yourusername/DOMe-and-DOMer-2.git +cd DOMe-and-DOMer-2 ``` -2. Set up environment variables in `.env`: +2. Install dependencies using pip: ```bash -OPENAI_API_KEY=your_openai_api_key +pip install -e . ``` -## Running the Benchmark - +Required dependencies: +- selenium +- webdriver-manager +- Pillow +- numpy +- requests +- beautifulsoup4 +- openai +- python-dotenv + +3. 
Set up your OpenAI API key in a `.env` file: ```bash -python run.py --tasks data/dom_tasks.jsonl --output data/results --evaluate +OPENAI_API_KEY=your_api_key_here ``` -This will: -1. Execute each task in the tasks file -2. Save screenshots and results -3. Compare actual HTML elements with expected ones -4. Run GPT-4V evaluation on screenshots - -## Ground Truth Management - -Ground truth images are stored in `evaluation/ground_truth/` with a consistent naming scheme: -``` -evaluation/ground_truth/ -└── task_1_gt.png -└── task_2_gt.png -... -``` - -The tasks file references these images using relative paths: -```json -{ - "id": 1, - "ground_truth": { - "screenshot": "evaluation/ground_truth/task_1_gt.png" - } -} -``` +## Usage -## Testing +The benchmark can be run in either serial or parallel mode: -Run environment tests: +### Parallel Mode (Default) ```bash -python test_env.py +python run.py --tasks data/dom_tasks.jsonl --output results --max-workers 4 --evaluate ``` -Run OpenAI API connection test: +### Serial Mode ```bash -python test_openai.py +python run.py --tasks data/dom_tasks.jsonl --output results --mode serial --evaluate ``` -## Evaluation Process +### Key Arguments +- `--tasks`: Path to JSONL file containing tasks +- `--output`: Output directory for results +- `--mode`: Run tasks in 'serial' or 'parallel' mode (default: parallel) +- `--max-workers`: Number of parallel workers (default: 4) +- `--evaluate`: Run GPT-4V evaluation after tasks complete +- `--evaluate-mode`: Run evaluations in 'serial' or 'parallel' mode (default: parallel) +- `--save-accessibility-tree`: Save accessibility trees for each task +- `--wait-time`: Wait time between actions in seconds (default: 2.0) -1. **Technical Validation**: - - Element found and interacted with - - No errors during interaction - - Accessibility tree verification +## Directory Structure -2. **Visual Validation**: - - Compare after screenshot with ground truth - - Verify expected visual changes - - Check for unintended side effects +``` +DOMe-and-DOMer-2/ +├── data/ +│ ├── dom_tasks.jsonl # Task definitions +│ └── task_schema.json # JSON schema for tasks +├── evaluation/ +│ ├── auto_eval.py # Evaluation orchestrator +│ ├── parallel_eval.py # Parallel evaluation implementation +│ ├── image_match.py # GPT-4V image comparison +│ └── fuzzy_match.py # HTML structure comparison +├── parallel_runner.py # Parallel task execution +├── serial_runner.py # Serial task execution +├── utils.py # Shared utilities +├── run.py # Main entry point +└── pyproject.toml # Project configuration and dependencies +``` + +## Output Structure + +Results are saved in the specified output directory: +``` +output_dir/ +├── results.json # Task execution results +├── evaluation.json # GPT-4V evaluation results +├── benchmark.log # Execution logs +├── *_before.png # Screenshots before interaction +├── *_after.png # Screenshots after interaction +└── *_tree.json # Accessibility trees (if enabled) +``` -3. 
**GPT-4V Analysis**: - - Compare before/after/ground-truth screenshots - - Verify interaction success - - Check visual state matches expectations +## Task Format -## Output Format +Tasks are defined in `data/dom_tasks.jsonl`: ```json { - "total_tasks": 10, - "successful_tasks": 8, - "evaluations": [ - { - "task_id": "amazon_search_1", - "success": true, - "evaluation": "Detailed evaluation text...", - "timestamp": 1234567890 - } - ] + "id": "task_id", + "task": "Click the search box and type 'hello'", + "web": "https://example.com", + "interaction": "type", + "target_element": { + "type": "css", + "value": "#searchbox" + }, + "input_text": "hello", + "ground_truth": { + "screenshot": "path/to/ground_truth.png" + } } ``` -## Requirements +## Evaluation + +The benchmark uses GPT-4V to evaluate task success by comparing: +1. Before/after screenshots with ground truth +2. DOM structure changes +3. Task completion criteria -- Python 3.8+ -- Chrome/Chromium browser -- OpenAI API key (for evaluation) -- Required packages in `requirements.txt` +Evaluation can be run in parallel or serial mode and produces detailed scoring and reasoning for each task. ## Contributing -[Contributing guidelines will be added] +Contributions are welcome! Please feel free to submit a Pull Request. ## License -[License information will be added] +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/evaluation/ground_truth/task_2_gt.png b/evaluation/ground_truth/task_2_gt.png new file mode 100644 index 0000000..fd3d47f Binary files /dev/null and b/evaluation/ground_truth/task_2_gt.png differ diff --git a/evaluation/ground_truth/task_3_gt.png b/evaluation/ground_truth/task_3_gt.png new file mode 100644 index 0000000..d9e77fd Binary files /dev/null and b/evaluation/ground_truth/task_3_gt.png differ diff --git a/evaluation/ground_truth/task_4_gt.png b/evaluation/ground_truth/task_4_gt.png new file mode 100644 index 0000000..dfbca1a Binary files /dev/null and b/evaluation/ground_truth/task_4_gt.png differ diff --git a/evaluation/ground_truth/task_5_gt.png b/evaluation/ground_truth/task_5_gt.png new file mode 100644 index 0000000..2e4ef51 Binary files /dev/null and b/evaluation/ground_truth/task_5_gt.png differ diff --git a/examples/model_usage.py b/examples/model_usage.py new file mode 100644 index 0000000..1315c92 --- /dev/null +++ b/examples/model_usage.py @@ -0,0 +1,52 @@ +"""Example usage of different models in the DOM benchmark.""" + +import os +from dotenv import load_dotenv +from models import GPT4Model, ClaudeModel +from utils import TaskExecutor + +# Load environment variables +load_dotenv() + +def run_example_task(model, task): + """Run a single task with the given model and print results.""" + executor = TaskExecutor() + print(f"\nRunning task with {model.__class__.__name__}:") + print(f"Task: {task['task']}") + + result = model.run_task(task, executor) + + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + print(f"Time taken: {result.time_taken:.2f}s") + return result + +def main(): + # Initialize models + gpt4_model = GPT4Model(api_key=os.getenv("OPENAI_API_KEY")) + claude_model = ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")) + + # Example task + task = { + "task": "Click the 'Sign In' button", + "target_element": { + "type": "css", + "value": "#signin-button" + }, + "interaction": "click" + } + + # Run with both models + gpt4_result = run_example_task(gpt4_model, task) + claude_result = run_example_task(claude_model, task) 
+ + # Compare results + print("\nComparison:") + print(f"GPT-4 success: {gpt4_result.success}") + print(f"Claude success: {claude_result.success}") + print(f"GPT-4 time: {gpt4_result.time_taken:.2f}s") + print(f"Claude time: {claude_result.time_taken:.2f}s") + +if __name__ == "__main__": + main() diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..3ee30b0 --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,5 @@ +from .base import BaseModel, WebInteraction, TaskResult +from .gpt4 import GPT4Model +from .claude import ClaudeModel + +__all__ = ['BaseModel', 'WebInteraction', 'TaskResult', 'GPT4Model', 'ClaudeModel'] diff --git a/models/base.py b/models/base.py new file mode 100644 index 0000000..0228efe --- /dev/null +++ b/models/base.py @@ -0,0 +1,119 @@ +from typing import Dict, Any, List, Optional +from dataclasses import dataclass +from abc import ABC, abstractmethod +from pathlib import Path + +@dataclass +class WebInteraction: + """Represents a web interaction instruction.""" + action: str # click, type, hover + selector_type: str # css, xpath, id + selector_value: str + input_text: Optional[str] = None + description: Optional[str] = None + +@dataclass +class TaskResult: + """Represents the result of executing a task.""" + task_id: str + success: bool + before_screenshot: Optional[str] = None + after_screenshot: Optional[str] = None + html_element: Optional[str] = None + accessibility_tree: Optional[Dict[str, Any]] = None + error: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization.""" + return { + "task_id": self.task_id, + "success": self.success, + "before_screenshot": self.before_screenshot, + "after_screenshot": self.after_screenshot, + "html_element": self.html_element, + "accessibility_tree": self.accessibility_tree, + "error": self.error, + "metadata": self.metadata + } + +class BaseModel(ABC): + """Base class for all models that can run the DOM benchmark.""" + + def __init__(self, model_name: str, model_config: Dict[str, Any]): + self.model_name = model_name + self.model_config = model_config + + @abstractmethod + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse a task definition into a web interaction instruction. + + Args: + task: Task definition from dom_tasks.jsonl + + Returns: + WebInteraction object with parsed instructions + """ + pass + + @abstractmethod + def handle_error(self, task: Dict[str, Any], error: str) -> WebInteraction: + """Handle errors during task execution and optionally retry. + + Args: + task: Original task definition + error: Error message from failed execution + + Returns: + New WebInteraction to try, or None to give up + """ + pass + + @abstractmethod + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Validate if the task execution was successful. + + Args: + task: Original task definition + result: Result from task execution + + Returns: + True if task was successful, False otherwise + """ + pass + + def run_task(self, task: Dict[str, Any], executor) -> TaskResult: + """Run a single task using the model's logic. 
+ + Args: + task: Task definition from dom_tasks.jsonl + executor: TaskExecutor instance to run web interactions + + Returns: + TaskResult with execution results + """ + try: + # Parse task into web interaction + interaction = self.parse_task(task) + + # Execute the interaction + result = executor.execute_interaction(interaction) + + # Validate the result + success = self.validate_result(task, result) + result.success = success + + return result + + except Exception as e: + # Ask the model for a revised interaction and try it once + retry_interaction = self.handle_error(task, str(e)) + if retry_interaction: + try: + result = executor.execute_interaction(retry_interaction) + result.success = self.validate_result(task, result) + return result + except Exception as retry_error: + e = retry_error + + # If the error can't be handled, return failure + return TaskResult( + task_id=task["id"], + success=False, + error=str(e) + ) diff --git a/models/claude.py b/models/claude.py new file mode 100644 index 0000000..a3d2ca5 --- /dev/null +++ b/models/claude.py @@ -0,0 +1,137 @@ +import json +from typing import Dict, Any, Optional +from anthropic import Anthropic +from .base import BaseModel, WebInteraction, TaskResult + +class ClaudeModel(BaseModel): + """Claude model implementation for the DOM benchmark.""" + + def __init__(self, api_key: str, model_config: Dict[str, Any] = None): + super().__init__("claude-3", model_config or {}) + self.client = Anthropic(api_key=api_key) + + # Default system prompt + self.system_prompt = """You are an AI assistant that helps users interact with web elements. +Your task is to understand the user's intent and generate precise web element interactions. +You should focus on the specific interaction requested, using the provided element selectors. + +For each task, you will: +1. Understand the required interaction (click, type, hover) +2. Identify the correct element using the provided selector +3. Generate the appropriate interaction instruction + +Respond only with the exact interaction needed, no explanations or additional text. 
+ +The response should be a JSON object with the following structure: +{ + "action": "click|type|hover", + "selector_type": "css|xpath|id", + "selector_value": "string", + "input_text": "string" (optional) +}""" + + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse task using Claude to understand the interaction.""" + # Construct prompt + prompt = f"""Task: {task['task']} +Target Element: {json.dumps(task['target_element'])} +Interaction Type: {task.get('interaction', 'click')} +Input Text: {task.get('input_text', '')} + +Generate the web interaction instruction as a JSON object.""" + + # Get Claude completion + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=150, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + # Parse JSON response + try: + interaction_data = json.loads(response.content[0].text) + return WebInteraction( + action=interaction_data.get('action', task.get('interaction', 'click')), + selector_type=interaction_data.get('selector_type', task['target_element']['type']), + selector_value=interaction_data.get('selector_value', task['target_element']['value']), + input_text=interaction_data.get('input_text', task.get('input_text')), + description=task['task'] + ) + except json.JSONDecodeError: + # Fallback to task values if Claude's response isn't valid JSON + return WebInteraction( + action=task.get('interaction', 'click'), + selector_type=task['target_element']['type'], + selector_value=task['target_element']['value'], + input_text=task.get('input_text'), + description=task['task'] + ) + + def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]: + """Use Claude to understand and handle errors.""" + prompt = f"""Task: {task['task']} +Error: {error} + +Analyze the error and suggest a modified interaction. Respond with a JSON object for the new interaction. +If the error is unrecoverable, respond with exactly "GIVE UP".""" + + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=150, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + suggestion = response.content[0].text.strip() + if suggestion == "GIVE UP": + return None + + try: + # Try to parse Claude's suggestion + interaction_data = json.loads(suggestion) + return WebInteraction( + action=interaction_data['action'], + selector_type=interaction_data['selector_type'], + selector_value=interaction_data['selector_value'], + input_text=interaction_data.get('input_text'), + description=task['task'] + ) + except (json.JSONDecodeError, KeyError): + # If Claude's suggestion isn't valid, try one more time with original task + return self.parse_task(task) + + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Use Claude to validate if the task was successful.""" + if result.error: + return False + + prompt = f"""Task: {task['task']} +Target Element HTML: {result.html_element} +Before Screenshot: {result.before_screenshot} +After Screenshot: {result.after_screenshot} + +Analyze if the interaction was successful. Consider: +1. The HTML element matches the expected interaction +2. The screenshots show the expected change +3. 
No errors occurred + +Respond with exactly 'YES' or 'NO'.""" + + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=10, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + return response.content[0].text.strip() == "YES" diff --git a/models/gpt4.py b/models/gpt4.py new file mode 100644 index 0000000..0a591c1 --- /dev/null +++ b/models/gpt4.py @@ -0,0 +1,96 @@ +import json +from typing import Dict, Any, Optional +from openai import OpenAI +from .base import BaseModel, WebInteraction, TaskResult + +class GPT4Model(BaseModel): + """GPT-4 model implementation for the DOM benchmark.""" + + def __init__(self, api_key: str, model_config: Dict[str, Any] = None): + super().__init__("gpt-4", model_config or {}) + self.client = OpenAI(api_key=api_key) + + # Default system prompt + self.system_prompt = """You are an AI assistant that helps users interact with web elements. +Your task is to understand the user's intent and generate precise web element interactions. +You should focus on the specific interaction requested, using the provided element selectors. + +For each task, you will: +1. Understand the required interaction (click, type, hover) +2. Identify the correct element using the provided selector +3. Generate the appropriate interaction instruction + +Respond only with the exact interaction needed, no explanations or additional text.""" + + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse task using GPT-4 to understand the interaction.""" + # Construct prompt + prompt = f"""Task: {task['task']} +Target Element: {json.dumps(task['target_element'])} +Interaction Type: {task.get('interaction', 'click')} +Input Text: {task.get('input_text', '')} + +Generate the web interaction instruction.""" + + # Get GPT-4 completion + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + # Parse response into WebInteraction + return WebInteraction( + action=task.get('interaction', 'click'), + selector_type=task['target_element']['type'], + selector_value=task['target_element']['value'], + input_text=task.get('input_text'), + description=task['task'] + ) + + def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]: + """Use GPT-4 to understand and handle errors.""" + prompt = f"""Task: {task['task']} +Error: {error} + +How should we modify the interaction to handle this error? +If the error is unrecoverable, respond with "GIVE UP".""" + + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + suggestion = response.choices[0].message.content + if suggestion == "GIVE UP": + return None + + # Try to generate a new interaction based on GPT-4's suggestion + return self.parse_task(task) + + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Use GPT-4 to validate if the task was successful.""" + if result.error: + return False + + prompt = f"""Task: {task['task']} +Target Element HTML: {result.html_element} +Was this interaction successful? 
Answer with just 'YES' or 'NO'.""" + + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + return response.choices[0].message.content == "YES" diff --git a/models/prompts.py b/models/prompts.py new file mode 100644 index 0000000..7d057ac --- /dev/null +++ b/models/prompts.py @@ -0,0 +1,74 @@ +"""Specialized prompts for different web interaction scenarios.""" + +CLICK_PROMPT = """Analyze the following click interaction task: +Task: {task} +Element: {element} + +Consider: +1. Is this a simple click or does it require special handling (e.g., double-click, right-click)? +2. Are there any potential timing issues (e.g., waiting for element to be clickable)? +3. Should we verify any state changes after the click? + +Generate a JSON interaction specification.""" + +TYPE_PROMPT = """Analyze the following text input task: +Task: {task} +Element: {element} +Text to Input: {input_text} + +Consider: +1. Should we clear existing text first? +2. Are there any special characters that need handling? +3. Should we simulate natural typing speed? +4. Do we need to trigger any events after typing (e.g., Enter key)? + +Generate a JSON interaction specification.""" + +HOVER_PROMPT = """Analyze the following hover interaction task: +Task: {task} +Element: {element} + +Consider: +1. How long should the hover last? +2. Are there any tooltip or dropdown menus that need time to appear? +3. Should we verify the hover state visually? + +Generate a JSON interaction specification.""" + +NAVIGATION_PROMPT = """Analyze the following navigation task: +Task: {task} +Element: {element} + +Consider: +1. Should we wait for any redirects? +2. Are there any confirmation dialogs? +3. Should we verify the new URL? + +Generate a JSON interaction specification.""" + +ERROR_ANALYSIS_PROMPT = """Analyze the following error: +Task: {task} +Error: {error} +Previous Attempts: {attempts} + +Consider: +1. Is this a timing issue? +2. Is the element actually present but not visible/clickable? +3. Has the page structure changed? +4. Are we using the right selector? + +Suggest a modified interaction or respond with "GIVE UP" if unrecoverable.""" + +VALIDATION_PROMPT = """Validate the following interaction result: +Task: {task} +Element Before: {before_html} +Element After: {after_html} +Screenshots: {screenshots} + +Consider: +1. Did the element state change as expected? +2. Are there any visible changes in the screenshots? +3. Did any errors occur? +4. Is the result consistent with the task goal? 
+ +Respond with exactly 'YES' or 'NO'.""" diff --git a/pyproject.toml b/pyproject.toml index d30ead9..e8b23bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,14 +24,14 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "selenium==4.15.2", - "webdriver-manager==4.0.1", - "Pillow==10.1.0", - "numpy==1.24.3", - "requests==2.31.0", - "beautifulsoup4==4.12.2", - "openai==1.3.7", - "python-dotenv==1.0.0", + "selenium", + "webdriver-manager", + "Pillow", + "numpy", + "requests", + "beautifulsoup4", + "openai", + "python-dotenv", ] [project.urls] diff --git a/scripts/compare_models.py b/scripts/compare_models.py new file mode 100644 index 0000000..89c6c7c --- /dev/null +++ b/scripts/compare_models.py @@ -0,0 +1,116 @@ +"""Script to compare different model performances on the DOM benchmark.""" + +import os +import json +import time +import argparse +from typing import List, Dict +from concurrent.futures import ThreadPoolExecutor +from dotenv import load_dotenv + +from models import GPT4Model, ClaudeModel +from utils import TaskExecutor + +def load_tasks(task_file: str) -> List[Dict]: + """Load benchmark tasks from a JSON file.""" + with open(task_file, 'r') as f: + return [json.loads(line) for line in f] + +def run_model_on_task(model, task, executor): + """Run a single task with timing and error handling.""" + start_time = time.time() + try: + result = model.run_task(task, executor) + end_time = time.time() + return { + 'task': task['task'], + 'success': result.success, + 'error': result.error, + 'time_taken': end_time - start_time + } + except Exception as e: + end_time = time.time() + return { + 'task': task['task'], + 'success': False, + 'error': str(e), + 'time_taken': end_time - start_time + } + +def evaluate_model(model, tasks: List[Dict], num_workers: int = 4): + """Evaluate a model on all tasks.""" + results = [] + executor = TaskExecutor() + + with ThreadPoolExecutor(max_workers=num_workers) as pool: + futures = [ + pool.submit(run_model_on_task, model, task, executor) + for task in tasks + ] + results = [f.result() for f in futures] + + return results + +def calculate_metrics(results: List[Dict]): + """Calculate performance metrics from results.""" + total_tasks = len(results) + successful_tasks = sum(1 for r in results if r['success']) + total_time = sum(r['time_taken'] for r in results) + error_types = {} + + for r in results: + if r['error']: + error_type = type(r['error']).__name__ + error_types[error_type] = error_types.get(error_type, 0) + 1 + + return { + 'total_tasks': total_tasks, + 'successful_tasks': successful_tasks, + 'success_rate': successful_tasks / total_tasks * 100, + 'average_time': total_time / total_tasks, + 'total_time': total_time, + 'error_types': error_types + } + +def main(): + parser = argparse.ArgumentParser(description='Compare model performances on DOM benchmark') + parser.add_argument('--task-file', default='data/dom_tasks.jsonl', help='Path to task file') + parser.add_argument('--num-workers', type=int, default=4, help='Number of parallel workers') + parser.add_argument('--output', default='results/comparison.json', help='Output file for results') + args = parser.parse_args() + + # Load environment variables and tasks + load_dotenv() + tasks = load_tasks(args.task_file) + + # Initialize models + models = { + 'gpt4': GPT4Model(api_key=os.getenv("OPENAI_API_KEY")), + 'claude': ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")) + } + + # Run evaluation + results = {} + for model_name, model in models.items(): + 
print(f"\nEvaluating {model_name}...") + model_results = evaluate_model(model, tasks, args.num_workers) + metrics = calculate_metrics(model_results) + results[model_name] = { + 'metrics': metrics, + 'task_results': model_results + } + + print(f"\nResults for {model_name}:") + print(f"Success rate: {metrics['success_rate']:.2f}%") + print(f"Average time per task: {metrics['average_time']:.2f}s") + print("Error types:", metrics['error_types']) + + # Save results + os.makedirs(os.path.dirname(args.output), exist_ok=True) + with open(args.output, 'w') as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to {args.output}") + +if __name__ == "__main__": + main()
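For reference, adding a backend beyond GPT-4 and Claude only requires implementing the three abstract methods of `BaseModel` in `models/base.py`; the inherited `run_task` then handles execution and validation. The sketch below is illustrative only (the `RuleBasedModel` name and its trivial retry/validation logic are not part of this diff); it uses `TaskExecutor` the same way `examples/model_usage.py` does.

```python
"""Illustrative-only backend: maps task fields straight to an interaction, no LLM calls."""

from typing import Any, Dict, Optional

from models.base import BaseModel, WebInteraction, TaskResult


class RuleBasedModel(BaseModel):
    """Hypothetical baseline that trusts the selectors given in the task file."""

    def __init__(self):
        super().__init__("rule-based", {})

    def parse_task(self, task: Dict[str, Any]) -> WebInteraction:
        # Use the interaction type and selector exactly as defined in dom_tasks.jsonl.
        return WebInteraction(
            action=task.get("interaction", "click"),
            selector_type=task["target_element"]["type"],
            selector_value=task["target_element"]["value"],
            input_text=task.get("input_text"),
            description=task["task"],
        )

    def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]:
        # No recovery strategy: give up on the first error.
        return None

    def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
        # Treat any error-free execution as success.
        return result.error is None


if __name__ == "__main__":
    from utils import TaskExecutor  # executor provided by the benchmark

    task = {
        "id": "example_click",
        "task": "Click the 'Sign In' button",
        "web": "https://example.com",
        "interaction": "click",
        "target_element": {"type": "css", "value": "#signin-button"},
    }
    print(RuleBasedModel().run_task(task, TaskExecutor()).to_dict())
```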
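The diff imports `TaskExecutor` from `utils` in both `examples/model_usage.py` and `scripts/compare_models.py`, but the executor itself is not shown here. Assuming a Selenium-backed implementation (the class name, constructor arguments, and the way the task id reaches the result are all guesses, not the project's actual `utils.TaskExecutor`), the mapping from a `WebInteraction` to browser actions could look roughly like this:

```python
"""Hypothetical Selenium-backed executor; the real utils.TaskExecutor is not in this diff."""

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from models.base import TaskResult, WebInteraction

# Map the benchmark's selector types onto Selenium locator strategies.
_BY = {"css": By.CSS_SELECTOR, "xpath": By.XPATH, "id": By.ID}


class SketchTaskExecutor:
    def __init__(self, wait_time: float = 2.0):
        self.driver = webdriver.Chrome()  # Selenium resolves the driver binary
        self.wait = WebDriverWait(self.driver, wait_time)

    def execute_interaction(self, interaction: WebInteraction) -> TaskResult:
        element = self.wait.until(
            EC.presence_of_element_located(
                (_BY[interaction.selector_type], interaction.selector_value)
            )
        )
        if interaction.action == "click":
            element.click()
        elif interaction.action == "type":
            element.clear()
            element.send_keys(interaction.input_text or "")
        elif interaction.action == "hover":
            ActionChains(self.driver).move_to_element(element).perform()
        else:
            raise ValueError(f"Unsupported action: {interaction.action}")

        # How the real executor learns the task id and target URL is not visible in
        # the diff; the interaction description is used here purely as a placeholder.
        return TaskResult(
            task_id=interaction.description or "unknown",
            success=True,
            html_element=element.get_attribute("outerHTML"),
        )

    def close(self):
        self.driver.quit()
```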
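Similarly, `evaluation/image_match.py` (GPT-4V image comparison) appears in the README's directory structure but is not included in this diff. A plausible shape for that comparison, assuming the OpenAI vision-capable chat API and a stand-in model name, is sketched below; the function name and prompt wording are illustrative only.

```python
"""Hypothetical GPT-4V screenshot check; evaluation/image_match.py itself is not in this diff."""

import base64

from openai import OpenAI


def _data_url(path: str) -> str:
    # Inline a PNG as a base64 data URL for the vision API.
    with open(path, "rb") as f:
        return "data:image/png;base64," + base64.b64encode(f.read()).decode("utf-8")


def screenshots_match(after_path: str, ground_truth_path: str, task: str,
                      model: str = "gpt-4o") -> bool:
    """Ask a vision-capable model whether the after-screenshot matches the ground truth."""
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=model,  # stand-in model name; substitute the project's configured one
        temperature=0,
        max_tokens=5,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": (
                    f"Task: {task}\nThe first image is the state after the interaction, "
                    "the second is the expected ground truth. Do they show the same end "
                    "state? Answer YES or NO."
                )},
                {"type": "image_url", "image_url": {"url": _data_url(after_path)}},
                {"type": "image_url", "image_url": {"url": _data_url(ground_truth_path)}},
            ],
        }],
    )
    return response.choices[0].message.content.strip().upper().startswith("YES")
```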