diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a86798c --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# OpenAI API Key for GPT-4 model +OPENAI_API_KEY=your-openai-api-key-here + +# Anthropic API Key for Claude model +ANTHROPIC_API_KEY=your-anthropic-api-key-here + +# Optional: Model configurations +GPT4_MODEL=gpt-4-turbo-preview # or gpt-4 +CLAUDE_MODEL=claude-3-opus-20240229 + +# Optional: Execution settings +MAX_WORKERS=4 +TIMEOUT_SECONDS=30 diff --git a/README.md b/README.md index 9f1dbd0..baab19e 100644 --- a/README.md +++ b/README.md @@ -10,175 +10,125 @@ DOM and DOMer-2 focuses on testing a model's ability to interact with web elemen 2. Real websites with diverse DOM structures 3. Ground truth screenshots for validation 4. GPT-4V based evaluation +5. Support for both serial and parallel execution -## Directory Structure - -``` -DOMe-and-DOMer-2/ -├── data/ -│ ├── dom_tasks.jsonl # Task definitions -│ └── ground_truth/ # Ground truth screenshots -│ ├── amazon_search_1_gt.png -│ └── ... -├── evaluation/ -│ ├── auto_eval.py # GPT-4V evaluation script -│ └── README.md # Evaluation documentation -├── results/ # Results for each run -│ └── run_001/ -│ ├── before_*.png # Screenshots before interaction -│ ├── after_*.png # Screenshots after interaction -│ ├── accessibility_*.json # Accessibility trees -│ ├── results.json # Raw results -│ ├── evaluation.json # GPT-4V evaluations -│ └── benchmark.log # Detailed logs -├── prompts.py # LLM system prompts -├── run.py # Main benchmark runner -├── utils.py # Utility functions -└── requirements.txt # Dependencies - -## Task Format - -Tasks are defined in `data/dom_tasks.jsonl`: - -```json -{ - "web_name": "Cambridge Dictionary", - "id": "cambridge_lookup_1", - "task": "Click the search box and type 'hello'", - "web": "https://dictionary.cambridge.org/", - "element_type": "input", - "interaction": "type", - "target_element": { - "type": "id", - "value": "searchword" - }, - "input_text": "hello", - "target_html": "", - "ground_truth": { - "screenshot": "evaluation/ground_truth/task_1_gt.png", - "description": "The word 'hello' has been entered in the search box" - } -} -``` - -Key fields: -- `target_element`: Selector information for finding the element -- `target_html`: Expected HTML structure of the element -- `ground_truth`: Reference screenshot and description - -## Ground Truth - -Ground truth is provided in two forms: -1. **Screenshots**: Visual state after successful interaction -2. **Descriptions**: Text description of expected changes - -Located in `data/ground_truth/`, each task has: -- `[task_id]_gt.png`: Screenshot of successful interaction -- Description in task JSON explaining expected changes - -## Environment Setup +## Installation -1. Create a virtual environment and install dependencies: +1. Clone the repository: ```bash -python -m venv venv -source venv/bin/activate # On Windows: venv\Scripts\activate -pip install -r requirements.txt +git clone https://github.com/yourusername/DOMe-and-DOMer-2.git +cd DOMe-and-DOMer-2 ``` -2. Set up environment variables in `.env`: +2. Install dependencies using pip: ```bash -OPENAI_API_KEY=your_openai_api_key +pip install -e . ``` -## Running the Benchmark - +Required dependencies: +- selenium +- webdriver-manager +- Pillow +- numpy +- requests +- beautifulsoup4 +- openai +- python-dotenv + +3. 
Set up your OpenAI API key in a `.env` file: ```bash -python run.py --tasks data/dom_tasks.jsonl --output data/results --evaluate +OPENAI_API_KEY=your_api_key_here ``` -This will: -1. Execute each task in the tasks file -2. Save screenshots and results -3. Compare actual HTML elements with expected ones -4. Run GPT-4V evaluation on screenshots - -## Ground Truth Management - -Ground truth images are stored in `evaluation/ground_truth/` with a consistent naming scheme: -``` -evaluation/ground_truth/ -└── task_1_gt.png -└── task_2_gt.png -... -``` - -The tasks file references these images using relative paths: -```json -{ - "id": 1, - "ground_truth": { - "screenshot": "evaluation/ground_truth/task_1_gt.png" - } -} -``` +## Usage -## Testing +The benchmark can be run in either serial or parallel mode: -Run environment tests: +### Parallel Mode (Default) ```bash -python test_env.py +python run.py --tasks data/dom_tasks.jsonl --output results --max-workers 4 --evaluate ``` -Run OpenAI API connection test: +### Serial Mode ```bash -python test_openai.py +python run.py --tasks data/dom_tasks.jsonl --output results --mode serial --evaluate ``` -## Evaluation Process +### Key Arguments +- `--tasks`: Path to JSONL file containing tasks +- `--output`: Output directory for results +- `--mode`: Run tasks in 'serial' or 'parallel' mode (default: parallel) +- `--max-workers`: Number of parallel workers (default: 4) +- `--evaluate`: Run GPT-4V evaluation after tasks complete +- `--evaluate-mode`: Run evaluations in 'serial' or 'parallel' mode (default: parallel) +- `--save-accessibility-tree`: Save accessibility trees for each task +- `--wait-time`: Wait time between actions in seconds (default: 2.0) -1. **Technical Validation**: - - Element found and interacted with - - No errors during interaction - - Accessibility tree verification +## Directory Structure -2. **Visual Validation**: - - Compare after screenshot with ground truth - - Verify expected visual changes - - Check for unintended side effects +``` +DOMe-and-DOMer-2/ +├── data/ +│ ├── dom_tasks.jsonl # Task definitions +│ └── task_schema.json # JSON schema for tasks +├── evaluation/ +│ ├── auto_eval.py # Evaluation orchestrator +│ ├── parallel_eval.py # Parallel evaluation implementation +│ ├── image_match.py # GPT-4V image comparison +│ └── fuzzy_match.py # HTML structure comparison +├── parallel_runner.py # Parallel task execution +├── serial_runner.py # Serial task execution +├── utils.py # Shared utilities +├── run.py # Main entry point +└── pyproject.toml # Project configuration and dependencies +``` + +## Output Structure + +Results are saved in the specified output directory: +``` +output_dir/ +├── results.json # Task execution results +├── evaluation.json # GPT-4V evaluation results +├── benchmark.log # Execution logs +├── *_before.png # Screenshots before interaction +├── *_after.png # Screenshots after interaction +└── *_tree.json # Accessibility trees (if enabled) +``` -3. 
**GPT-4V Analysis**: - - Compare before/after/ground-truth screenshots - - Verify interaction success - - Check visual state matches expectations +## Task Format -## Output Format +Tasks are defined in `data/dom_tasks.jsonl`: ```json { - "total_tasks": 10, - "successful_tasks": 8, - "evaluations": [ - { - "task_id": "amazon_search_1", - "success": true, - "evaluation": "Detailed evaluation text...", - "timestamp": 1234567890 - } - ] + "id": "task_id", + "task": "Click the search box and type 'hello'", + "web": "https://example.com", + "interaction": "type", + "target_element": { + "type": "css", + "value": "#searchbox" + }, + "input_text": "hello", + "ground_truth": { + "screenshot": "path/to/ground_truth.png" + } } ``` -## Requirements +## Evaluation + +The benchmark uses GPT-4V to evaluate task success by comparing: +1. Before/after screenshots with ground truth +2. DOM structure changes +3. Task completion criteria -- Python 3.8+ -- Chrome/Chromium browser -- OpenAI API key (for evaluation) -- Required packages in `requirements.txt` +Evaluation can be run in parallel or serial mode and produces detailed scoring and reasoning for each task. ## Contributing -[Contributing guidelines will be added] +Contributions are welcome! Please feel free to submit a Pull Request. ## License -[License information will be added] +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/evaluation/ground_truth/task_2_gt.png b/evaluation/ground_truth/task_2_gt.png new file mode 100644 index 0000000..fd3d47f Binary files /dev/null and b/evaluation/ground_truth/task_2_gt.png differ diff --git a/evaluation/ground_truth/task_3_gt.png b/evaluation/ground_truth/task_3_gt.png new file mode 100644 index 0000000..d9e77fd Binary files /dev/null and b/evaluation/ground_truth/task_3_gt.png differ diff --git a/evaluation/ground_truth/task_4_gt.png b/evaluation/ground_truth/task_4_gt.png new file mode 100644 index 0000000..dfbca1a Binary files /dev/null and b/evaluation/ground_truth/task_4_gt.png differ diff --git a/evaluation/ground_truth/task_5_gt.png b/evaluation/ground_truth/task_5_gt.png new file mode 100644 index 0000000..2e4ef51 Binary files /dev/null and b/evaluation/ground_truth/task_5_gt.png differ diff --git a/examples/model_usage.py b/examples/model_usage.py new file mode 100644 index 0000000..1315c92 --- /dev/null +++ b/examples/model_usage.py @@ -0,0 +1,52 @@ +"""Example usage of different models in the DOM benchmark.""" + +import os +from dotenv import load_dotenv +from models import GPT4Model, ClaudeModel +from utils import TaskExecutor + +# Load environment variables +load_dotenv() + +def run_example_task(model, task): + """Run a single task with the given model and print results.""" + executor = TaskExecutor() + print(f"\nRunning task with {model.__class__.__name__}:") + print(f"Task: {task['task']}") + + result = model.run_task(task, executor) + + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + print(f"Time taken: {result.time_taken:.2f}s") + return result + +def main(): + # Initialize models + gpt4_model = GPT4Model(api_key=os.getenv("OPENAI_API_KEY")) + claude_model = ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")) + + # Example task + task = { + "task": "Click the 'Sign In' button", + "target_element": { + "type": "css", + "value": "#signin-button" + }, + "interaction": "click" + } + + # Run with both models + gpt4_result = run_example_task(gpt4_model, task) + claude_result = run_example_task(claude_model, task) 
+ + # Compare results + print("\nComparison:") + print(f"GPT-4 success: {gpt4_result.success}") + print(f"Claude success: {claude_result.success}") + print(f"GPT-4 time: {gpt4_result.time_taken:.2f}s") + print(f"Claude time: {claude_result.time_taken:.2f}s") + +if __name__ == "__main__": + main() diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..3ee30b0 --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,5 @@ +from .base import BaseModel, WebInteraction, TaskResult +from .gpt4 import GPT4Model +from .claude import ClaudeModel + +__all__ = ['BaseModel', 'WebInteraction', 'TaskResult', 'GPT4Model', 'ClaudeModel'] diff --git a/models/base.py b/models/base.py new file mode 100644 index 0000000..0228efe --- /dev/null +++ b/models/base.py @@ -0,0 +1,119 @@ +from typing import Dict, Any, List, Optional +from dataclasses import dataclass +from abc import ABC, abstractmethod +from pathlib import Path + +@dataclass +class WebInteraction: + """Represents a web interaction instruction.""" + action: str # click, type, hover + selector_type: str # css, xpath, id + selector_value: str + input_text: Optional[str] = None + description: Optional[str] = None + +@dataclass +class TaskResult: + """Represents the result of executing a task.""" + task_id: str + success: bool + before_screenshot: Optional[str] = None + after_screenshot: Optional[str] = None + html_element: Optional[str] = None + accessibility_tree: Optional[Dict[str, Any]] = None + error: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for serialization.""" + return { + "task_id": self.task_id, + "success": self.success, + "before_screenshot": self.before_screenshot, + "after_screenshot": self.after_screenshot, + "html_element": self.html_element, + "accessibility_tree": self.accessibility_tree, + "error": self.error, + "metadata": self.metadata + } + +class BaseModel(ABC): + """Base class for all models that can run the DOM benchmark.""" + + def __init__(self, model_name: str, model_config: Dict[str, Any]): + self.model_name = model_name + self.model_config = model_config + + @abstractmethod + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse a task definition into a web interaction instruction. + + Args: + task: Task definition from dom_tasks.jsonl + + Returns: + WebInteraction object with parsed instructions + """ + pass + + @abstractmethod + def handle_error(self, task: Dict[str, Any], error: str) -> WebInteraction: + """Handle errors during task execution and optionally retry. + + Args: + task: Original task definition + error: Error message from failed execution + + Returns: + New WebInteraction to try, or None to give up + """ + pass + + @abstractmethod + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Validate if the task execution was successful. + + Args: + task: Original task definition + result: Result from task execution + + Returns: + True if task was successful, False otherwise + """ + pass + + def run_task(self, task: Dict[str, Any], executor) -> TaskResult: + """Run a single task using the model's logic. 
+ + Args: + task: Task definition from dom_tasks.jsonl + executor: TaskExecutor instance to run web interactions + + Returns: + TaskResult with execution results + """ + try: + # Parse task into web interaction + interaction = self.parse_task(task) + + # Execute the interaction + result = executor.execute_interaction(interaction) + + # Validate the result + success = self.validate_result(task, result) + result.success = success + + return result + + except Exception as e: + # Ask the model for a revised interaction and try it once + retry_interaction = self.handle_error(task, str(e)) + if retry_interaction: + try: + result = executor.execute_interaction(retry_interaction) + result.success = self.validate_result(task, result) + return result + except Exception as retry_error: + e = retry_error + + # If the error can't be handled, return failure + return TaskResult( + task_id=task["id"], + success=False, + error=str(e) + ) diff --git a/models/claude.py b/models/claude.py new file mode 100644 index 0000000..a3d2ca5 --- /dev/null +++ b/models/claude.py @@ -0,0 +1,137 @@ +import json +from typing import Dict, Any, Optional +from anthropic import Anthropic +from .base import BaseModel, WebInteraction, TaskResult + +class ClaudeModel(BaseModel): + """Claude model implementation for the DOM benchmark.""" + + def __init__(self, api_key: str, model_config: Dict[str, Any] = None): + super().__init__("claude-3", model_config or {}) + self.client = Anthropic(api_key=api_key) + + # Default system prompt + self.system_prompt = """You are an AI assistant that helps users interact with web elements. +Your task is to understand the user's intent and generate precise web element interactions. +You should focus on the specific interaction requested, using the provided element selectors. + +For each task, you will: +1. Understand the required interaction (click, type, hover) +2. Identify the correct element using the provided selector +3. Generate the appropriate interaction instruction + +Respond only with the exact interaction needed, no explanations or additional text. 
+ +The response should be a JSON object with the following structure: +{ + "action": "click|type|hover", + "selector_type": "css|xpath|id", + "selector_value": "string", + "input_text": "string" (optional) +}""" + + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse task using Claude to understand the interaction.""" + # Construct prompt + prompt = f"""Task: {task['task']} +Target Element: {json.dumps(task['target_element'])} +Interaction Type: {task.get('interaction', 'click')} +Input Text: {task.get('input_text', '')} + +Generate the web interaction instruction as a JSON object.""" + + # Get Claude completion + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=150, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + # Parse JSON response + try: + interaction_data = json.loads(response.content[0].text) + return WebInteraction( + action=interaction_data.get('action', task.get('interaction', 'click')), + selector_type=interaction_data.get('selector_type', task['target_element']['type']), + selector_value=interaction_data.get('selector_value', task['target_element']['value']), + input_text=interaction_data.get('input_text', task.get('input_text')), + description=task['task'] + ) + except json.JSONDecodeError: + # Fallback to task values if Claude's response isn't valid JSON + return WebInteraction( + action=task.get('interaction', 'click'), + selector_type=task['target_element']['type'], + selector_value=task['target_element']['value'], + input_text=task.get('input_text'), + description=task['task'] + ) + + def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]: + """Use Claude to understand and handle errors.""" + prompt = f"""Task: {task['task']} +Error: {error} + +Analyze the error and suggest a modified interaction. Respond with a JSON object for the new interaction. +If the error is unrecoverable, respond with exactly "GIVE UP".""" + + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=150, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + suggestion = response.content[0].text.strip() + if suggestion == "GIVE UP": + return None + + try: + # Try to parse Claude's suggestion + interaction_data = json.loads(suggestion) + return WebInteraction( + action=interaction_data['action'], + selector_type=interaction_data['selector_type'], + selector_value=interaction_data['selector_value'], + input_text=interaction_data.get('input_text'), + description=task['task'] + ) + except (json.JSONDecodeError, KeyError): + # If Claude's suggestion isn't valid, try one more time with original task + return self.parse_task(task) + + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Use Claude to validate if the task was successful.""" + if result.error: + return False + + prompt = f"""Task: {task['task']} +Target Element HTML: {result.html_element} +Before Screenshot: {result.before_screenshot} +After Screenshot: {result.after_screenshot} + +Analyze if the interaction was successful. Consider: +1. The HTML element matches the expected interaction +2. The screenshots show the expected change +3. 
No errors occurred + +Respond with exactly 'YES' or 'NO'.""" + + response = self.client.messages.create( + model="claude-3-opus-20240229", + max_tokens=10, + temperature=0, + system=self.system_prompt, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + return response.content[0].text.strip() == "YES" diff --git a/models/gpt4.py b/models/gpt4.py new file mode 100644 index 0000000..0a591c1 --- /dev/null +++ b/models/gpt4.py @@ -0,0 +1,96 @@ +import json +from typing import Dict, Any, Optional +from openai import OpenAI +from .base import BaseModel, WebInteraction, TaskResult + +class GPT4Model(BaseModel): + """GPT-4 model implementation for the DOM benchmark.""" + + def __init__(self, api_key: str, model_config: Dict[str, Any] = None): + super().__init__("gpt-4", model_config or {}) + self.client = OpenAI(api_key=api_key) + + # Default system prompt + self.system_prompt = """You are an AI assistant that helps users interact with web elements. +Your task is to understand the user's intent and generate precise web element interactions. +You should focus on the specific interaction requested, using the provided element selectors. + +For each task, you will: +1. Understand the required interaction (click, type, hover) +2. Identify the correct element using the provided selector +3. Generate the appropriate interaction instruction + +Respond only with the exact interaction needed, no explanations or additional text.""" + + def parse_task(self, task: Dict[str, Any]) -> WebInteraction: + """Parse task using GPT-4 to understand the interaction.""" + # Construct prompt + prompt = f"""Task: {task['task']} +Target Element: {json.dumps(task['target_element'])} +Interaction Type: {task.get('interaction', 'click')} +Input Text: {task.get('input_text', '')} + +Generate the web interaction instruction.""" + + # Get GPT-4 completion + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + # Parse response into WebInteraction + return WebInteraction( + action=task.get('interaction', 'click'), + selector_type=task['target_element']['type'], + selector_value=task['target_element']['value'], + input_text=task.get('input_text'), + description=task['task'] + ) + + def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]: + """Use GPT-4 to understand and handle errors.""" + prompt = f"""Task: {task['task']} +Error: {error} + +How should we modify the interaction to handle this error? +If the error is unrecoverable, respond with "GIVE UP".""" + + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + suggestion = response.choices[0].message.content + if suggestion == "GIVE UP": + return None + + # Try to generate a new interaction based on GPT-4's suggestion + return self.parse_task(task) + + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: + """Use GPT-4 to validate if the task was successful.""" + if result.error: + return False + + prompt = f"""Task: {task['task']} +Target Element HTML: {result.html_element} +Was this interaction successful? 
Answer with just 'YES' or 'NO'.""" + + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": prompt} + ], + temperature=0 + ) + + return response.choices[0].message.content == "YES" diff --git a/models/prompts.py b/models/prompts.py new file mode 100644 index 0000000..7d057ac --- /dev/null +++ b/models/prompts.py @@ -0,0 +1,74 @@ +"""Specialized prompts for different web interaction scenarios.""" + +CLICK_PROMPT = """Analyze the following click interaction task: +Task: {task} +Element: {element} + +Consider: +1. Is this a simple click or does it require special handling (e.g., double-click, right-click)? +2. Are there any potential timing issues (e.g., waiting for element to be clickable)? +3. Should we verify any state changes after the click? + +Generate a JSON interaction specification.""" + +TYPE_PROMPT = """Analyze the following text input task: +Task: {task} +Element: {element} +Text to Input: {input_text} + +Consider: +1. Should we clear existing text first? +2. Are there any special characters that need handling? +3. Should we simulate natural typing speed? +4. Do we need to trigger any events after typing (e.g., Enter key)? + +Generate a JSON interaction specification.""" + +HOVER_PROMPT = """Analyze the following hover interaction task: +Task: {task} +Element: {element} + +Consider: +1. How long should the hover last? +2. Are there any tooltip or dropdown menus that need time to appear? +3. Should we verify the hover state visually? + +Generate a JSON interaction specification.""" + +NAVIGATION_PROMPT = """Analyze the following navigation task: +Task: {task} +Element: {element} + +Consider: +1. Should we wait for any redirects? +2. Are there any confirmation dialogs? +3. Should we verify the new URL? + +Generate a JSON interaction specification.""" + +ERROR_ANALYSIS_PROMPT = """Analyze the following error: +Task: {task} +Error: {error} +Previous Attempts: {attempts} + +Consider: +1. Is this a timing issue? +2. Is the element actually present but not visible/clickable? +3. Has the page structure changed? +4. Are we using the right selector? + +Suggest a modified interaction or respond with "GIVE UP" if unrecoverable.""" + +VALIDATION_PROMPT = """Validate the following interaction result: +Task: {task} +Element Before: {before_html} +Element After: {after_html} +Screenshots: {screenshots} + +Consider: +1. Did the element state change as expected? +2. Are there any visible changes in the screenshots? +3. Did any errors occur? +4. Is the result consistent with the task goal? 
+ +Respond with exactly 'YES' or 'NO'.""" diff --git a/pyproject.toml b/pyproject.toml index d30ead9..e8b23bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,14 +24,14 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "selenium==4.15.2", - "webdriver-manager==4.0.1", - "Pillow==10.1.0", - "numpy==1.24.3", - "requests==2.31.0", - "beautifulsoup4==4.12.2", - "openai==1.3.7", - "python-dotenv==1.0.0", + "selenium", + "webdriver-manager", + "Pillow", + "numpy", + "requests", + "beautifulsoup4", + "openai", + "python-dotenv", ] [project.urls] diff --git a/scripts/compare_models.py b/scripts/compare_models.py new file mode 100644 index 0000000..89c6c7c --- /dev/null +++ b/scripts/compare_models.py @@ -0,0 +1,116 @@ +"""Script to compare different model performances on the DOM benchmark.""" + +import os +import json +import time +import argparse +from typing import List, Dict +from concurrent.futures import ThreadPoolExecutor +from dotenv import load_dotenv + +from models import GPT4Model, ClaudeModel +from utils import TaskExecutor + +def load_tasks(task_file: str) -> List[Dict]: + """Load benchmark tasks from a JSON file.""" + with open(task_file, 'r') as f: + return [json.loads(line) for line in f] + +def run_model_on_task(model, task, executor): + """Run a single task with timing and error handling.""" + start_time = time.time() + try: + result = model.run_task(task, executor) + end_time = time.time() + return { + 'task': task['task'], + 'success': result.success, + 'error': result.error, + 'time_taken': end_time - start_time + } + except Exception as e: + end_time = time.time() + return { + 'task': task['task'], + 'success': False, + 'error': str(e), + 'time_taken': end_time - start_time + } + +def evaluate_model(model, tasks: List[Dict], num_workers: int = 4): + """Evaluate a model on all tasks.""" + results = [] + executor = TaskExecutor() + + with ThreadPoolExecutor(max_workers=num_workers) as pool: + futures = [ + pool.submit(run_model_on_task, model, task, executor) + for task in tasks + ] + results = [f.result() for f in futures] + + return results + +def calculate_metrics(results: List[Dict]): + """Calculate performance metrics from results.""" + total_tasks = len(results) + successful_tasks = sum(1 for r in results if r['success']) + total_time = sum(r['time_taken'] for r in results) + error_types = {} + + for r in results: + if r['error']: + error_type = type(r['error']).__name__ + error_types[error_type] = error_types.get(error_type, 0) + 1 + + return { + 'total_tasks': total_tasks, + 'successful_tasks': successful_tasks, + 'success_rate': successful_tasks / total_tasks * 100, + 'average_time': total_time / total_tasks, + 'total_time': total_time, + 'error_types': error_types + } + +def main(): + parser = argparse.ArgumentParser(description='Compare model performances on DOM benchmark') + parser.add_argument('--task-file', default='data/dom_tasks.jsonl', help='Path to task file') + parser.add_argument('--num-workers', type=int, default=4, help='Number of parallel workers') + parser.add_argument('--output', default='results/comparison.json', help='Output file for results') + args = parser.parse_args() + + # Load environment variables and tasks + load_dotenv() + tasks = load_tasks(args.task_file) + + # Initialize models + models = { + 'gpt4': GPT4Model(api_key=os.getenv("OPENAI_API_KEY")), + 'claude': ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")) + } + + # Run evaluation + results = {} + for model_name, model in models.items(): + 
print(f"\nEvaluating {model_name}...") + model_results = evaluate_model(model, tasks, args.num_workers) + metrics = calculate_metrics(model_results) + results[model_name] = { + 'metrics': metrics, + 'task_results': model_results + } + + print(f"\nResults for {model_name}:") + print(f"Success rate: {metrics['success_rate']:.2f}%") + print(f"Average time per task: {metrics['average_time']:.2f}s") + print("Error types:", metrics['error_types']) + + # Save results + os.makedirs(os.path.dirname(args.output), exist_ok=True) + with open(args.output, 'w') as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to {args.output}") + +if __name__ == "__main__": + main()
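For reference, adding a backend beyond GPT-4 and Claude only requires implementing the three abstract methods of `BaseModel` in `models/base.py`; the inherited `run_task` then handles execution and validation. The sketch below is illustrative only (the `RuleBasedModel` name and its trivial retry/validation logic are not part of this diff); it uses `TaskExecutor` the same way `examples/model_usage.py` does.

```python
"""Illustrative-only backend: maps task fields straight to an interaction, no LLM calls."""

from typing import Any, Dict, Optional

from models.base import BaseModel, WebInteraction, TaskResult


class RuleBasedModel(BaseModel):
    """Hypothetical baseline that trusts the selectors given in the task file."""

    def __init__(self):
        super().__init__("rule-based", {})

    def parse_task(self, task: Dict[str, Any]) -> WebInteraction:
        # Use the interaction type and selector exactly as defined in dom_tasks.jsonl.
        return WebInteraction(
            action=task.get("interaction", "click"),
            selector_type=task["target_element"]["type"],
            selector_value=task["target_element"]["value"],
            input_text=task.get("input_text"),
            description=task["task"],
        )

    def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]:
        # No recovery strategy: give up on the first error.
        return None

    def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
        # Treat any error-free execution as success.
        return result.error is None


if __name__ == "__main__":
    from utils import TaskExecutor  # executor provided by the benchmark

    task = {
        "id": "example_click",
        "task": "Click the 'Sign In' button",
        "web": "https://example.com",
        "interaction": "click",
        "target_element": {"type": "css", "value": "#signin-button"},
    }
    print(RuleBasedModel().run_task(task, TaskExecutor()).to_dict())
```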
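The diff imports `TaskExecutor` from `utils` in both `examples/model_usage.py` and `scripts/compare_models.py`, but the executor itself is not shown here. Assuming a Selenium-backed implementation (the class name, constructor arguments, and the way the task id reaches the result are all guesses, not the project's actual `utils.TaskExecutor`), the mapping from a `WebInteraction` to browser actions could look roughly like this:

```python
"""Hypothetical Selenium-backed executor; the real utils.TaskExecutor is not in this diff."""

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from models.base import TaskResult, WebInteraction

# Map the benchmark's selector types onto Selenium locator strategies.
_BY = {"css": By.CSS_SELECTOR, "xpath": By.XPATH, "id": By.ID}


class SketchTaskExecutor:
    def __init__(self, wait_time: float = 2.0):
        self.driver = webdriver.Chrome()  # Selenium resolves the driver binary
        self.wait = WebDriverWait(self.driver, wait_time)

    def execute_interaction(self, interaction: WebInteraction) -> TaskResult:
        element = self.wait.until(
            EC.presence_of_element_located(
                (_BY[interaction.selector_type], interaction.selector_value)
            )
        )
        if interaction.action == "click":
            element.click()
        elif interaction.action == "type":
            element.clear()
            element.send_keys(interaction.input_text or "")
        elif interaction.action == "hover":
            ActionChains(self.driver).move_to_element(element).perform()
        else:
            raise ValueError(f"Unsupported action: {interaction.action}")

        # How the real executor learns the task id and target URL is not visible in
        # the diff; the interaction description is used here purely as a placeholder.
        return TaskResult(
            task_id=interaction.description or "unknown",
            success=True,
            html_element=element.get_attribute("outerHTML"),
        )

    def close(self):
        self.driver.quit()
```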
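Similarly, `evaluation/image_match.py` (GPT-4V image comparison) appears in the README's directory structure but is not included in this diff. A plausible shape for that comparison, assuming the OpenAI vision-capable chat API and a stand-in model name, is sketched below; the function name and prompt wording are illustrative only.

```python
"""Hypothetical GPT-4V screenshot check; evaluation/image_match.py itself is not in this diff."""

import base64

from openai import OpenAI


def _data_url(path: str) -> str:
    # Inline a PNG as a base64 data URL for the vision API.
    with open(path, "rb") as f:
        return "data:image/png;base64," + base64.b64encode(f.read()).decode("utf-8")


def screenshots_match(after_path: str, ground_truth_path: str, task: str,
                      model: str = "gpt-4o") -> bool:
    """Ask a vision-capable model whether the after-screenshot matches the ground truth."""
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=model,  # stand-in model name; substitute the project's configured one
        temperature=0,
        max_tokens=5,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": (
                    f"Task: {task}\nThe first image is the state after the interaction, "
                    "the second is the expected ground truth. Do they show the same end "
                    "state? Answer YES or NO."
                )},
                {"type": "image_url", "image_url": {"url": _data_url(after_path)}},
                {"type": "image_url", "image_url": {"url": _data_url(ground_truth_path)}},
            ],
        }],
    )
    return response.choices[0].message.content.strip().upper().startswith("YES")
```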