From 117a792709a3662cf7f6d5f054797fe62b923223 Mon Sep 17 00:00:00 2001 From: Dhruv Ahuja Date: Thu, 19 Dec 2024 19:54:48 -0800 Subject: [PATCH] Final Commit --- README.md | 119 ++++++------- analyze_insights.py | 109 ++++++++++++ analyze_results.py | 2 +- dataset_cleaner.py | 118 +++++++++++++ evaluation/fuzzy_match.py | 11 +- evaluation/image_match.py | 18 +- models/claude.py | 229 ++++++++++++++++--------- models/gemini.py | 139 ++++++++-------- models/gpt4.py | 342 +++++++++++++++----------------------- parallel_runner.py | 57 +++---- serial_runner.py | 6 +- 11 files changed, 705 insertions(+), 445 deletions(-) create mode 100644 analyze_insights.py create mode 100644 dataset_cleaner.py diff --git a/README.md b/README.md index baab19e..066a92a 100644 --- a/README.md +++ b/README.md @@ -33,97 +33,100 @@ Required dependencies: - requests - beautifulsoup4 - openai +- anthropic +- google-generativeai - python-dotenv -3. Set up your OpenAI API key in a `.env` file: +3. Set up your API keys in a `.env` file: ```bash -OPENAI_API_KEY=your_api_key_here +OPENAI_API_KEY=your_openai_key_here +ANTHROPIC_API_KEY=your_anthropic_key_here +GOOGLE_API_KEY=your_google_key_here ``` +## Supported Models + +The benchmark currently supports the following models: + +1. **GPT-4 Turbo** (OpenAI) + - Default model for both task execution and evaluation + - High accuracy but subject to rate limits (3500 RPM) + +2. **Claude 3 Haiku** (Anthropic) + - Fast and efficient for task execution + - Subject to stricter rate limits (5 RPM) + - Use `--serial` flag for best results + +3. **Gemini 1.5 Pro** (Google) + - Latest version of Google's Gemini model + - Good balance of speed and accuracy + ## Usage The benchmark can be run in either serial or parallel mode: ### Parallel Mode (Default) ```bash -python run.py --tasks data/dom_tasks.jsonl --output results --max-workers 4 --evaluate +# Run with GPT-4 +python -m benchmark --model gpt4 --tasks data/test_tasks.jsonl --output-dir results + +# Run with Claude +python -m benchmark --model claude --tasks data/test_tasks.jsonl --output-dir results --serial + +# Run with Gemini +python -m benchmark --model gemini --tasks data/test_tasks.jsonl --output-dir results ``` ### Serial Mode ```bash -python run.py --tasks data/dom_tasks.jsonl --output results --mode serial --evaluate +python -m benchmark --model [gpt4|claude|gemini] --tasks data/test_tasks.jsonl --output-dir results --serial ``` -### Key Arguments -- `--tasks`: Path to JSONL file containing tasks -- `--output`: Output directory for results -- `--mode`: Run tasks in 'serial' or 'parallel' mode (default: parallel) -- `--max-workers`: Number of parallel workers (default: 4) -- `--evaluate`: Run GPT-4V evaluation after tasks complete -- `--evaluate-mode`: Run evaluations in 'serial' or 'parallel' mode (default: parallel) -- `--save-accessibility-tree`: Save accessibility trees for each task -- `--wait-time`: Wait time between actions in seconds (default: 2.0) +### Evaluation +Results are automatically evaluated using GPT-4V for visual comparison and GPT-4 for HTML structure matching: -## Directory Structure - -``` -DOMe-and-DOMer-2/ -├── data/ -│ ├── dom_tasks.jsonl # Task definitions -│ └── task_schema.json # JSON schema for tasks -├── evaluation/ -│ ├── auto_eval.py # Evaluation orchestrator -│ ├── parallel_eval.py # Parallel evaluation implementation -│ ├── image_match.py # GPT-4V image comparison -│ └── fuzzy_match.py # HTML structure comparison -├── parallel_runner.py # Parallel task execution -├── serial_runner.py # 
Serial task execution -├── utils.py # Shared utilities -├── run.py # Main entry point -└── pyproject.toml # Project configuration and dependencies - -## Output Structure - -Results are saved in the specified output directory: -``` -output_dir/ -├── results.json # Task execution results -├── evaluation.json # GPT-4V evaluation results -├── benchmark.log # Execution logs -├── *_before.png # Screenshots before interaction -├── *_after.png # Screenshots after interaction -└── *_tree.json # Accessibility trees (if enabled) +```bash +python -m evaluate --tasks data/test_tasks.jsonl --results-dir results --output results/evaluation.json ``` ## Task Format -Tasks are defined in `data/dom_tasks.jsonl`: - +Tasks are defined in JSONL format with the following structure: ```json { - "id": "task_id", - "task": "Click the search box and type 'hello'", - "web": "https://example.com", - "interaction": "type", + "web_name": "Website Name", + "id": "unique_task_id", + "task": "Description of the interaction task", + "web": "https://website.url", + "element_type": "button|input|link", + "interaction": "click|type|hover", "target_element": { - "type": "css", - "value": "#searchbox" + "type": "id|class|xpath", + "value": "selector_value" }, - "input_text": "hello", + "input_text": "Text to type (for type interactions)", + "target_html": "HTML of target element", "ground_truth": { - "screenshot": "path/to/ground_truth.png" + "screenshot": "path/to/screenshot.png", + "description": "Expected result description" } } ``` -## Evaluation +## Rate Limits + +Different models have different rate limits: +- GPT-4: 3500 requests per minute +- Claude: 5 requests per minute +- Gemini: 60 requests per minute + +Use the `--serial` flag for models with strict rate limits (e.g., Claude) to avoid hitting limits. -The benchmark uses GPT-4V to evaluate task success by comparing: -1. Before/after screenshots with ground truth -2. DOM structure changes -3. Task completion criteria +## Test Tasks -Evaluation can be run in parallel or serial mode and produces detailed scoring and reasoning for each task. 
+The repository includes two task sets: +- `data/test_tasks.jsonl`: Full test set with 100+ tasks +- `data/test_tasks_10.jsonl`: Smaller set of 10 tasks for quick testing ## Contributing diff --git a/analyze_insights.py b/analyze_insights.py new file mode 100644 index 0000000..41ab1d5 --- /dev/null +++ b/analyze_insights.py @@ -0,0 +1,109 @@ +import json +from collections import defaultdict +from typing import Dict, List, Any + +def load_results() -> List[Dict[str, Any]]: + with open('results/results.json') as f: + return json.load(f) + +def analyze_results(results: List[Dict[str, Any]]) -> None: + total_tasks = len(results) + successes = [r for r in results if r.get('success', False)] + failures = [r for r in results if not r.get('success', False)] + + print("\n=== Overall Statistics ===") + print(f"Total Tasks: {total_tasks}") + print(f"Success Rate: {len(successes)/total_tasks*100:.2f}% ({len(successes)} successes, {len(failures)} failures)") + + # Error Analysis + error_types = defaultdict(int) + for task in failures: + error = task.get('error', 'Unknown error') + if isinstance(error, str): + # Simplify error messages to group similar errors + if 'has no attribute' in error: + error = "Missing attribute error" + elif 'timeout' in error.lower(): + error = "Timeout error" + elif 'not found' in error.lower(): + error = "Element not found" + elif 'failed evaluation' in error.lower(): + error = "Failed evaluation checks" + error_types[error] += 1 + + print("\n=== Error Analysis ===") + print("Common failure reasons:") + for error, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True): + percentage = (count / len(failures)) * 100 + print(f"{error}: {percentage:.1f}% ({count} tasks)") + + # Task Type Analysis + def categorize_task(task_desc: str) -> str: + desc = task_desc.lower() + if 'click' in desc: + return 'Click' + elif 'type' in desc or 'enter' in desc: + return 'Type/Input' + elif 'search' in desc: + return 'Search' + elif 'hover' in desc: + return 'Hover' + return 'Other' + + task_types = defaultdict(lambda: {'success': 0, 'fail': 0}) + for task in results: + task_type = categorize_task(task.get('task_description', '')) + if task.get('success', False): + task_types[task_type]['success'] += 1 + else: + task_types[task_type]['fail'] += 1 + + print("\n=== Task Type Analysis ===") + for task_type, stats in task_types.items(): + total = stats['success'] + stats['fail'] + success_rate = (stats['success']/total*100) if total > 0 else 0 + print(f"{task_type}: {success_rate:.1f}% success rate ({stats['success']}/{total} tasks)") + + # Website Analysis + def extract_website(task_id: str) -> str: + return task_id.split('_')[0] if '_' in task_id else 'unknown' + + website_stats = defaultdict(lambda: {'success': 0, 'fail': 0}) + for task in results: + website = extract_website(task.get('task_id', 'unknown')) + if task.get('success', False): + website_stats[website]['success'] += 1 + else: + website_stats[website]['fail'] += 1 + + print("\n=== Website Performance ===") + for website, stats in sorted(website_stats.items(), + key=lambda x: (x[1]['success'] + x[1]['fail']), + reverse=True): + total = stats['success'] + stats['fail'] + if total < 2: # Skip websites with very few tasks + continue + success_rate = (stats['success']/total*100) + print(f"{website}: {success_rate:.1f}% success rate ({stats['success']}/{total} tasks)") + + # Example Analysis + print("\n=== Example Cases ===") + print("\nSuccessful Tasks:") + for task in successes[:3]: + print(f"✓ 
{task.get('task_description', '')}") + print(f" ID: {task.get('task_id', '')}") + if task.get('error'): + print(f" Note: {task['error']}") + print() + + print("\nFailed Tasks:") + for task in failures[:3]: + print(f"✗ {task.get('task_description', '')}") + print(f" ID: {task.get('task_id', '')}") + if task.get('error'): + print(f" Error: {task['error']}") + print() + +if __name__ == "__main__": + results = load_results() + analyze_results(results) diff --git a/analyze_results.py b/analyze_results.py index 20541e2..a9ec371 100644 --- a/analyze_results.py +++ b/analyze_results.py @@ -8,7 +8,7 @@ # Calculate success percentage total_tasks = len(results) -successful_tasks = [result for result in results if result.get('final_score', 0) == 1] +successful_tasks = [result for result in results if result.get('final_score', 0) >= .8] success_percentage = (len(successful_tasks) / total_tasks) * 100 if total_tasks > 0 else 0 print(f"\nResults Analysis:") diff --git a/dataset_cleaner.py b/dataset_cleaner.py new file mode 100644 index 0000000..3c69321 --- /dev/null +++ b/dataset_cleaner.py @@ -0,0 +1,118 @@ +import json +import os +from pathlib import Path +from typing import Dict, List, Any, Optional +from openai import OpenAI +from dotenv import load_dotenv + +load_dotenv() + +class DatasetCleaner: + def __init__(self, results_file: str, api_key: Optional[str] = None): + """Initialize the dataset cleaner. + + Args: + results_file: Path to results.json file + api_key: OpenAI API key (optional, will use environment variable if not provided) + """ + self.results_file = Path(results_file) + self.client = OpenAI(api_key=api_key) + + def analyze_result(self, result: Dict[str, Any]) -> Dict[str, Any]: + """Analyze a single result entry to determine if it's valid.""" + response = self.client.chat.completions.create( + model="gpt-4-turbo", + messages=[ + { + "role": "system", + "content": """You are an expert at analyzing web automation test results to determine if a test case is invalid. +A test case should be considered invalid if it encounters issues that make it unsuitable for benchmarking, such as: +1. CAPTCHA or verification challenges +2. Network or connection issues +3. Page timeouts or loading failures +4. Security blocks or authentication requirements +5. Missing or broken page elements +6. Browser crashes +7. Rate limiting or API errors +8. Geolocation restrictions""" + }, + { + "role": "user", + "content": f"""Analyze this test result and determine if it should be excluded from benchmarking: + +Task ID: {result['task_id']} +Success: {result['success']} +Error: {result.get('error', 'None')} +Task Description: {result['task_description']} +HTML Element: {result.get('html_element', 'None')} + +Respond with a JSON object containing: +{{ + "is_valid": boolean, + "reason": string explaining why the test case is invalid (if applicable), + "confidence": float between 0 and 1 +}}""" + } + ], + response_format={"type": "json_object"} + ) + + return json.loads(response.choices[0].message.content) + + def clean_dataset(self, min_confidence: float = 0.8) -> Dict[str, List[str]]: + """Clean the dataset by analyzing results.json entries. 
+ + Args: + min_confidence: Minimum confidence threshold for filtering (default: 0.8) + + Returns: + Dictionary containing lists of valid and invalid test cases + """ + results = { + "valid": [], + "invalid": [] + } + + # Load and process results.json + with open(self.results_file) as f: + test_results = json.load(f) + + for result in test_results: + analysis = self.analyze_result(result) + + if analysis["is_valid"] or analysis["confidence"] < min_confidence: + results["valid"].append(result["task_id"]) + else: + results["invalid"].append({ + "task_id": result["task_id"], + "reason": analysis["reason"], + "confidence": analysis["confidence"] + }) + + # Save results + output_path = self.results_file.parent / "dataset_cleaning_results.json" + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + print(f"Dataset cleaning results saved to {output_path}") + print(f"Valid test cases: {len(results['valid'])}") + print(f"Invalid test cases: {len(results['invalid'])}") + print("\nInvalid test cases and reasons:") + for invalid in results["invalid"]: + print(f"- {invalid['task_id']}: {invalid['reason']} (confidence: {invalid['confidence']:.2f})") + + return results + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Clean benchmark dataset by filtering invalid test cases") + parser.add_argument("results_file", help="Path to results.json file") + parser.add_argument("--min-confidence", type=float, default=0.8, + help="Minimum confidence threshold for filtering (default: 0.8)") + parser.add_argument("--api-key", help="OpenAI API key (optional)") + + args = parser.parse_args() + + cleaner = DatasetCleaner(args.results_file, os.getenv("OPENAI_API_KEY")) + results = cleaner.clean_dataset(min_confidence=args.min_confidence) diff --git a/evaluation/fuzzy_match.py b/evaluation/fuzzy_match.py index d26034f..dde4c28 100644 --- a/evaluation/fuzzy_match.py +++ b/evaluation/fuzzy_match.py @@ -5,11 +5,18 @@ load_dotenv() system_prompt = """ -You are evaluating if a web automation task was completed successfully. Compare the HTML elements and determine if the task's goal was achieved, focusing on the relevant HTML changes that indicate success. +You are evaluating if a web automation task interacted with the correct HTML element. Your goal is to verify that the agent interacted with the intended element based on the task description and HTML. + +Guidelines: +1. Focus on element matching, not page state changes +2. Check if the element's attributes (id, class, text) match the task requirements +3. Verify the element is the correct type (button, link, input, etc.) +4. Ignore differences in element state or content after interaction +5. For forms/inputs, verify the correct input field was targeted Your output should be in the following format: Correctness: [True/False] -Reason: [Reason for the correctness/incorrectness of the agent's output] +Reason: [Explain if the correct element was targeted based on HTML attributes and type] """ def fuzzy_match_html( diff --git a/evaluation/image_match.py b/evaluation/image_match.py index c7c4925..ad9e3c6 100644 --- a/evaluation/image_match.py +++ b/evaluation/image_match.py @@ -7,11 +7,25 @@ from pathlib import Path system_prompt = """ -A task required an agent to create an image based on a prompt and your task is to compare the image it generated with the image it was supposed to generate. +You are evaluating if a web automation task achieved its intended final state. 
Your goal is to compare the final screenshot with the expected ground truth image. + +Guidelines: +1. Focus on the FINAL STATE of the page, not the process +2. Compare key visual elements that indicate task completion: + - For navigation: correct page/section is shown + - For form inputs: text appears in the right field + - For clicks: expected content/menu is visible +3. Ignore temporary visual elements like: + - Loading indicators + - Tooltips + - Hover states + - Transition animations +4. Don't try to verify the action being taken, only the end result +5. Minor visual differences (e.g., slight layout shifts, different ads) are acceptable Your output should be in the following format: Correctness: [True/False] -Reason: [Reason for the correctness/incorrectness of the agent's output] +Reason: [Explain if the final state matches the expected outcome, focusing on key visual indicators of task completion] """ class ImageServer: diff --git a/models/claude.py b/models/claude.py index 59a9741..a2101e3 100644 --- a/models/claude.py +++ b/models/claude.py @@ -1,68 +1,154 @@ import json +import time +import os +import logging +from pathlib import Path from typing import Dict, Any, Optional from anthropic import Anthropic from .base import BaseModel, WebInteraction, TaskResult +from bs4 import BeautifulSoup + +class RequestPool: + def __init__(self, max_requests_per_minute=5): # Claude has lower rate limits + self.requests = [] + self.max_requests = max_requests_per_minute + self.window = 60 # 1 minute window + self.min_wait = 12 # Minimum 12 seconds between requests (5 per minute) + + def can_make_request(self): + now = time.time() + # Remove old requests + self.requests = [t for t in self.requests if now - t < self.window] + + # Check if we've made any requests in the last min_wait seconds + if self.requests and (now - self.requests[-1]) < self.min_wait: + return False + + return len(self.requests) < self.max_requests + + def add_request(self): + self.requests.append(time.time()) class ClaudeModel(BaseModel): """Claude model implementation for the DOM benchmark.""" - def __init__(self, api_key: str, model_config: Dict[str, Any] = None): - super().__init__("claude-3", model_config or {}) - self.client = Anthropic(api_key=api_key) + def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None): + """Initialize ClaudeModel with Anthropic API key""" + super().__init__("claude-3-5-haiku-20241022", model_config or {}) + self.api_key = api_key or os.getenv('ANTHROPIC_API_KEY') + if not self.api_key: + raise ValueError("Anthropic API key not provided") + + self.client = Anthropic(api_key=self.api_key) + self.temperature = 0 + self.request_pool = RequestPool() # Add request pooling + self.model = "claude-3-5-haiku-20241022" # Set the specific model name + + # Setup logging for skipped tasks + self.output_dir = Path("results/skipped_tasks") + self.output_dir.mkdir(parents=True, exist_ok=True) + self.skipped_logger = logging.getLogger('skipped_tasks') + handler = logging.FileHandler(self.output_dir / 'skipped_tasks.log') + handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s')) + self.skipped_logger.addHandler(handler) + self.skipped_logger.setLevel(logging.INFO) # Default system prompt self.system_prompt = """You are an AI assistant that helps users interact with web elements. Your task is to understand the user's intent and generate precise web element interactions. -You should focus on the specific interaction requested, using the provided element selectors. 
-For each task, you will: -1. Understand the required interaction (click, type, hover) -2. Identify the correct element using the provided selector -3. Generate the appropriate interaction instruction +IMPORTANT: You MUST respond with ONLY a valid JSON object. No other text, explanations, or formatting. +The JSON object MUST follow this exact schema: +{ + "action": "click" | "type" | "hover", + "selector_type": "css" | "xpath" | "id" | "class", + "selector_value": "string", + "input_text": "string" // Only required for type actions +} -Respond only with the exact interaction needed, no explanations or additional text. +Guidelines for generating selectors: +1. Prefer stable selectors (id, unique class names) over dynamic ones +2. Consider element visibility and interactability +3. Handle dynamic content and loading states +4. Pay attention to timing and wait states -The response should be a JSON object with the following structure: +Example valid response: { - "action": "click|type|hover", - "selector_type": "css|xpath|id", - "selector_value": "string", - "input_text": "string" (optional) + "action": "click", + "selector_type": "css", + "selector_value": "#submit-button", + "input_text": null }""" - def parse_task(self, task: Dict[str, Any], page_html: str = None) -> WebInteraction: + def _clean_html(self, html: str) -> str: + """Keep only relevant semantic HTML elements and attributes for content analysis.""" + soup = BeautifulSoup(html, 'html.parser') + + # Define elements we want to keep + allowed_elements = { + 'div', 'span', 'p', 'a', 'button', 'input', 'select', 'option', + 'form', 'label', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'nav', + 'header', 'footer', 'main', 'section', 'article', 'aside', + 'ul', 'ol', 'li', 'table', 'tr', 'td', 'th', 'thead', 'tbody', + 'dialog', 'details', 'summary' + } + + # Define attributes we want to keep + allowed_attributes = { + 'a': ['href', 'title'], + 'img': ['alt', 'src'], + '*': ['id', 'class'] # Allow these on any element + } + + # Function to clean a tag + def clean_tag(tag): + if tag.name not in allowed_elements: + tag.unwrap() # Keep content but remove the tag + return + + # Remove all attributes except allowed ones + allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*'] + attrs = dict(tag.attrs) # Create a copy since we're modifying + for attr in attrs: + if attr not in allowed_for_tag: + del tag[attr] + + # Clean all tags in the document + for tag in soup.find_all(True): + clean_tag(tag) + + return str(soup) + + def parse_task(self, task: Dict[str, Any], page_html: str = None) -> Optional[WebInteraction]: """Parse task using Claude to understand the interaction.""" + # Clean HTML if provided + if page_html: + page_html = self._clean_html(page_html) + # Construct prompt - prompt = f"""Task: {task['task']} + user_prompt = f"""Task: {task['task']} Current Page HTML: {page_html if page_html else 'Not available'} -Based on the task description and current page HTML: -1. Determine the type of interaction needed (click, type, hover) -2. Identify the target element using the most reliable selector -3. 
Extract any input text if needed for type interactions
-
-Generate the web interaction instruction as a JSON object with:
-{
-    "action": "click|type|hover",
-    "selector_type": "css|xpath|id|class",
-    "selector_value": "string",
-    "input_text": "string" (optional)
-}"""
+Based on the task description and current page HTML, generate a web interaction as a JSON object."""
 
-        # Get Claude completion
-        response = self.client.messages.create(
-            model="claude-3-opus-20240229",
-            max_tokens=150,
-            temperature=0,
-            system=self.system_prompt,
-            messages=[
-                {"role": "user", "content": prompt}
-            ]
-        )
-        
-        # Parse JSON response
         try:
-            interaction_data = json.loads(response.content[0].text)
+            # Wait for rate limit if needed
+            while not self.request_pool.can_make_request():
+                time.sleep(1)
+            self.request_pool.add_request()
+            
+            response = self.client.messages.create(
+                model=self.model,
+                max_tokens=1024,
+                temperature=self.temperature,
+                system=self.system_prompt,
+                messages=[
+                    {"role": "user", "content": user_prompt}
+                ]
+            )
+            
+            # Extract and parse the JSON response from the first content block
+            interaction_data = json.loads(response.content[0].text)
             return WebInteraction(
                 action=interaction_data.get('action', 'click'),
                 selector_type=interaction_data.get('selector_type', 'css'),
@@ -70,52 +156,41 @@ def parse_task(self, task: Dict[str, Any], page_html: str = None) -> WebInteract
                 input_text=interaction_data.get('input_text'),
                 description=task['task']
             )
-        except json.JSONDecodeError:
-            # Fallback to task values if Claude's response isn't valid JSON
-            return WebInteraction(
-                action='click',
-                selector_type='css',
-                selector_value='',
-                input_text='',
-                description=task['task']
-            )
-        
+        except Exception as e:
+            print(f"Error in Claude task parsing: {str(e)}")
+            return None
+            
     def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]:
         """Use Claude to understand and handle errors."""
-        prompt = f"""Task: {task['task']}
+        error_prompt = f"""Task: {task['task']}
 Error: {error}
 
-Analyze the error and suggest a modified interaction. Respond with a JSON object for the new interaction.
-If the error is unrecoverable, respond with exactly "GIVE UP"."""
+Based on the task and error message, suggest a new web interaction that might work better. 
+Respond with a JSON object following the same schema as before."""
 
-        response = self.client.messages.create(
-            model="claude-3-opus-20240229",
-            max_tokens=150,
-            temperature=0,
-            system=self.system_prompt,
-            messages=[
-                {"role": "user", "content": prompt}
-            ]
-        )
-        
-        suggestion = response.content[0].text.strip()
-        if suggestion == "GIVE UP":
-            return None
-            
         try:
-            # Try to parse Claude's suggestion
-            interaction_data = json.loads(suggestion)
+            response = self.client.messages.create(
+                model=self.model,
+                max_tokens=1024,
+                temperature=self.temperature,
+                system=self.system_prompt,
+                messages=[
+                    {"role": "user", "content": error_prompt}
+                ]
+            )
+            
+            interaction_data = json.loads(response.content[0].text)
             return WebInteraction(
-                action=interaction_data['action'],
-                selector_type=interaction_data['selector_type'],
-                selector_value=interaction_data['selector_value'],
+                action=interaction_data.get('action', 'click'),
+                selector_type=interaction_data.get('selector_type', 'css'),
+                selector_value=interaction_data.get('selector_value'),
                 input_text=interaction_data.get('input_text'),
                 description=task['task']
             )
-        except (json.JSONDecodeError, KeyError):
-            # If Claude's suggestion isn't valid, try one more time with original task
-            return self.parse_task(task)
-    
+        except Exception as e:
+            print(f"Error in Claude error handling: {str(e)}")
+            return None
+            
     def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
         """Use Claude to validate if the task was successful."""
         if result.error:
diff --git a/models/gemini.py b/models/gemini.py
index 44a364e..5f2146e 100644
--- a/models/gemini.py
+++ b/models/gemini.py
@@ -13,13 +13,15 @@ class GeminiModel(BaseModel):
     """Gemini model implementation for the DOM benchmark."""
     
-    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
-        super().__init__("gemini-1.5-pro", model_config or {})
+    def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
+        """Initialize GeminiModel."""
+        super().__init__("gemini-pro", model_config or {})
+        
+        # Configure Gemini API
         genai.configure(api_key=api_key)
         self.model = genai.GenerativeModel('gemini-1.5-pro')
         self.max_retries = 10
-        self.temperature = model_config.get("temperature", 0)
-        self.max_tokens = 32000
+        self.temperature = 0
         # Use GPT-4 tokenizer as an approximation since Gemini uses similar tokenization
         self.tokenizer = tiktoken.encoding_for_model("gpt-4")
         self.function_parser = FunctionParser()
@@ -63,62 +65,65 @@ def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
     """
     
     def _clean_html(self, html: str) -> str:
-        """Remove all JavaScript and CSS from HTML to reduce size."""
-        # First use BeautifulSoup for robust HTML parsing
-        soup = BeautifulSoup(html, "html.parser")
+        """Keep only relevant semantic HTML elements and attributes for content analysis."""
+        # Count tokens before cleaning
+        tokenizer = genai.GenerativeModel("gemini-pro").count_tokens
+        initial_tokens = tokenizer(html).total_tokens
+        print(f"[Gemini] Initial HTML context length: {initial_tokens} tokens")
         
-        # Remove script tags and their contents
-        for script in soup.find_all('script'):
-            script.decompose()
+        # Use BeautifulSoup for robust HTML parsing
+        soup = BeautifulSoup(html, "html.parser")
         
-        # Remove style tags and their contents
-        for style in soup.find_all('style'):
-            style.decompose()
+        # Define elements we want to keep
+        allowed_elements = {
+            # Text content elements
+            'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'ul', 'ol', 'li', 'a', 'table', 'tr', 'td', 'th',
+            'div', 'span', 
'strong', 'em', 'code', 'pre', + 'blockquote', 'article', 'section', 'main', - # Remove link tags for stylesheets - for link in soup.find_all('link', rel="stylesheet"): - link.decompose() + # Interactive elements + 'button', 'input', 'select', 'option', 'textarea', 'form', + 'label', 'fieldset', 'legend', 'datalist', 'output', - # Remove all style attributes - for tag in soup.find_all(): - if tag.has_attr('style'): - del tag['style'] - - # Get the cleaned HTML - cleaned_html = str(soup) - - # Additional regex-based cleaning for things BeautifulSoup might miss - # Remove noscript tags and their contents - cleaned_html = re.sub(r')<[^<]*)*<\/noscript>', '', cleaned_html) - - # Remove template tags (often used by JS frameworks) - cleaned_html = re.sub(r')<[^<]*)*<\/template>', '', cleaned_html) - - # Remove preloaded resources - cleaned_html = re.sub(r']*rel="preload"[^>]*>', '', cleaned_html) - - # Remove meta tags with CSS/JS content - cleaned_html = re.sub(r']*http-equiv="Content-Style-Type"[^>]*>', '', cleaned_html) - cleaned_html = re.sub(r']*http-equiv="Content-Script-Type"[^>]*>', '', cleaned_html) - - # Remove inline event handlers - cleaned_html = re.sub(r'\son\w+="[^"]*"', '', cleaned_html) - - # Remove javascript: URLs - cleaned_html = re.sub(r'href="javascript:[^"]*"', '', cleaned_html) + # Media elements that might be clickable + 'img', 'svg', 'canvas', 'video', 'audio', + + # Navigation elements + 'nav', 'header', 'footer', 'menu', 'menuitem', + + # Interactive containers + 'dialog', 'details', 'summary' + } - # Remove data attributes (often used for JS functionality) - cleaned_html = re.sub(r'\sdata-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html) + # Define attributes we want to keep + allowed_attributes = { + 'a': ['href', 'title'], + 'img': ['alt', 'src'], + '*': ['id', 'class'] # Allow these on any element + } - # Remove framework-specific attributes - cleaned_html = re.sub(r'\s(?:ng|v|x)-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html) + # Function to clean a tag + def clean_tag(tag): + if tag.name not in allowed_elements: + tag.unwrap() # Keep content but remove the tag + return + + # Remove all attributes except allowed ones + allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*'] + attrs = dict(tag.attrs) # Create a copy since we're modifying + for attr in attrs: + if attr not in allowed_for_tag: + del tag[attr] - # Remove old-style HTML styling attributes - attrs_to_remove = ['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing', - 'color', 'face', 'height', 'hspace', 'marginheight', 'marginwidth', - 'size', 'valign', 'vspace', 'width'] - for attr in attrs_to_remove: - cleaned_html = re.sub(fr'\s{attr}="[^"]*"', '', cleaned_html) + # Clean all tags in the document + for tag in soup.find_all(True): + clean_tag(tag) + + cleaned_html = str(soup) + final_tokens = tokenizer(cleaned_html).total_tokens + print(f"[Gemini] Final HTML context length: {final_tokens} tokens") + print(f"[Gemini] Reduced by: {initial_tokens - final_tokens} tokens ({((initial_tokens - final_tokens) / initial_tokens * 100):.1f}%)") return cleaned_html @@ -126,26 +131,26 @@ def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict """Helper method to call Gemini API with retry logic.""" try: # Convert messages to Gemini format - prompt = "" + gemini_messages = [] for msg in messages: - role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: " - prompt += f"{role_prefix}{msg['content']}\n\n" - - # Add 
explicit instruction for JSON output - prompt += "\nPlease respond with a valid JSON object following the specified format." - + if msg["role"] == "system": + # Prepend system message to user message since Gemini doesn't support system + continue + elif msg["role"] == "user": + gemini_messages.append(msg["content"]) + elif msg["role"] == "assistant": + gemini_messages.append(msg["content"]) + + # Join all messages with newlines + prompt = "\n".join(gemini_messages) + + # Make API call response = self.model.generate_content( prompt, generation_config=genai.types.GenerationConfig( - temperature=self.temperature, - max_output_tokens=self.max_tokens + temperature=self.temperature ) ) - - # Ensure the response was generated successfully - if not response.parts: - raise Exception("Empty response from Gemini") - return response, False except Exception as e: if any(err in str(e).lower() for err in ["too_long", "length", "token limit"]): diff --git a/models/gpt4.py b/models/gpt4.py index 0427776..5328faf 100644 --- a/models/gpt4.py +++ b/models/gpt4.py @@ -11,6 +11,21 @@ from bs4 import BeautifulSoup from .base import BaseModel, WebInteraction, TaskResult +class RequestPool: + def __init__(self, max_requests_per_minute=3500): + self.requests = [] + self.max_requests = max_requests_per_minute + self.window = 60 # 1 minute window + + def can_make_request(self): + now = time.time() + # Remove old requests + self.requests = [t for t in self.requests if now - t < self.window] + return len(self.requests) < self.max_requests + + def add_request(self): + self.requests.append(time.time()) + class GPT4Model(BaseModel): """GPT-4 model implementation for the DOM benchmark.""" @@ -22,9 +37,10 @@ def __init__(self, api_key: str = None): self.client = OpenAI(api_key=self.api_key) self.max_retries = 10 - self.model = "gpt-4" # Switched to gpt-4 model + self.model = "gpt-4-turbo" # Switched to gpt-4 model self.temperature = 0 - self.tokenizer = tiktoken.encoding_for_model("gpt-4") + self.tokenizer = tiktoken.encoding_for_model("gpt-4-turbo") + self.request_pool = RequestPool() # Add request pooling # Setup logging for skipped tasks self.output_dir = Path("results/skipped_tasks") @@ -39,35 +55,44 @@ def __init__(self, api_key: str = None): self.system_prompt = """You are an AI assistant that helps users interact with web elements. Your task is to understand the user's intent and generate precise web element interactions. -For each task, analyze: -1. The user's goal and required interaction (click, type, hover) -2. The target element's properties and accessibility -3. Any constraints or special conditions +IMPORTANT: You MUST respond with ONLY a valid JSON object. No other text, explanations, or formatting. +The JSON object MUST follow this exact schema: +{ + "action": "click" | "type" | "hover", + "selector_type": "css" | "xpath" | "id" | "class", + "selector_value": "string", + "input_text": "string" // Only required for type actions +} -Key Guidelines: +Guidelines for generating selectors: 1. Prefer stable selectors (id, unique class names) over dynamic ones 2. Consider element visibility and interactability 3. Handle dynamic content and loading states 4. 
Pay attention to timing and wait states -Generate interactions in this JSON format: +Example valid response: { - "action": "click|type|hover", - "selector_type": "css|xpath|id|class", - "selector_value": "string", - "input_text": "string", # For type actions - "description": "string" # Optional description of the interaction + "action": "click", + "selector_type": "css", + "selector_value": "#submit-button", + "input_text": null }""" def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]: """Helper method to call OpenAI API with retry logic.""" + if not self.request_pool.can_make_request(): + wait_time = 1 + print(f"Rate limit exceeded, waiting {wait_time}s before retrying.") + time.sleep(wait_time) + return self._call_api(messages, retry_count) + try: response = self.client.chat.completions.create( model=self.model, messages=messages, - temperature=self.temperature, - max_tokens=self.max_tokens + temperature=self.temperature ) + self.request_pool.add_request() return response, False except Exception as e: if retry_count >= self.max_retries: @@ -77,7 +102,7 @@ def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict wait_time = min(2 ** retry_count, 60) # Exponential backoff if hasattr(e, "__class__"): if e.__class__.__name__ == "RateLimitError": - wait_time = max(wait_time, 10) + wait_time = max(wait_time, 10) # Back to original wait time elif e.__class__.__name__ == "APIError": wait_time = max(wait_time, 15) @@ -86,134 +111,142 @@ def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict return self._call_api(messages, retry_count + 1) def _clean_html(self, html: str) -> str: - """Aggressively clean and truncate HTML to reduce size while keeping key elements.""" - # First use BeautifulSoup for robust HTML parsing + """Keep only relevant semantic HTML elements and attributes for content analysis.""" + # Count tokens before cleaning + initial_tokens = len(self.tokenizer.encode(html)) + print(f"[GPT-4] Initial HTML context length: {initial_tokens} tokens") + + # Use BeautifulSoup for robust HTML parsing soup = BeautifulSoup(html, "html.parser") - # Remove all tags except those needed for interaction - keep_tags = {'input', 'button', 'a', 'select', 'textarea', 'form', 'label'} - for tag in soup.find_all(): - if tag.name not in keep_tags: - # Keep the text content but remove the tag - tag.replace_with(tag.get_text(' ', strip=True)) + # Define elements we want to keep + allowed_elements = { + # Text content elements + 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'ul', 'ol', 'li', 'a', 'table', 'tr', 'td', 'th', + 'div', 'span', 'strong', 'em', 'code', 'pre', + 'blockquote', 'article', 'section', 'main', + + # Interactive elements + 'button', 'input', 'select', 'option', 'textarea', 'form', + 'label', 'fieldset', 'legend', 'datalist', 'output', + + # Media elements that might be clickable + 'img', 'svg', 'canvas', 'video', 'audio', + + # Navigation elements + 'nav', 'header', 'footer', 'menu', 'menuitem', + + # Interactive containers + 'dialog', 'details', 'summary' + } + + # Define attributes we want to keep + allowed_attributes = { + 'a': ['href', 'title'], + 'img': ['alt', 'src'], + '*': ['id', 'class'] # Allow these on any element + } - # Keep only essential attributes - essential_attrs = {'id', 'class', 'name', 'type', 'value', 'href', 'role', 'aria-label'} - for tag in soup.find_all(): + # Function to clean a tag + def clean_tag(tag): + if tag.name not in allowed_elements: + tag.unwrap() # Keep content but 
remove the tag + return + + # Remove all attributes except allowed ones + allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*'] attrs = dict(tag.attrs) # Create a copy since we're modifying for attr in attrs: - if attr not in essential_attrs: + if attr not in allowed_for_tag: del tag[attr] - elif attr == 'class': # Truncate long class names - classes = tag['class'][:2] if isinstance(tag['class'], list) else tag['class'].split()[:2] - tag['class'] = ' '.join(classes) - # Get the cleaned HTML + # Clean all tags in the document + for tag in soup.find_all(True): + clean_tag(tag) + cleaned_html = str(soup) - - # Remove extra whitespace and newlines - cleaned_html = ' '.join(cleaned_html.split()) - - # Truncate very long attribute values - cleaned_html = re.sub(r'((?:id|class|name)="[^"]{30})[^"]*"', r'\1..."', cleaned_html) - - # Remove empty or whitespace-only text nodes - cleaned_html = re.sub(r'>\s+<', '><', cleaned_html) + final_tokens = len(self.tokenizer.encode(cleaned_html)) + print(f"[GPT-4] Final HTML context length: {final_tokens} tokens") + print(f"[GPT-4] Reduced by: {initial_tokens - final_tokens} tokens ({((initial_tokens - final_tokens) / initial_tokens * 100):.1f}%)") return cleaned_html def parse_task(self, task: Dict[str, Any], page_html: str = None) -> Optional[WebInteraction]: """Parse task using GPT-4 to understand the interaction.""" + # Clean HTML if provided if page_html: page_html = self._clean_html(page_html) - # Construct messages messages = [ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": f"""Task Description: {task['task']} + {"role": "user", "content": f"""Task: {task['task']} Current Page HTML: {page_html if page_html else 'Not available'} -Based on the task description and current page HTML: -1. Determine the type of interaction needed (click, type, hover) -2. Identify the target element using the most reliable selector -3. 
Extract any input text if needed for type interactions"""} +Based on the task description and current page HTML, generate a web interaction as a JSON object."""} ] - # Check total token count before making API call - total_tokens = sum(len(self.tokenizer.encode(msg["content"])) for msg in messages) - if total_tokens > 128000: # GPT-4 Turbo's context limit - self.skipped_logger.info( - f"Task skipped due to length - URL: {task.get('url', 'N/A')}, " - f"Task: {task.get('task', 'N/A')}, " - f"Token count: {total_tokens} (limit: 128000)" - ) - return None # Skip the task instead of using ground truth - - response, error = self._call_api(messages) - if error or not response: - return None # Skip on API errors instead of using ground truth - try: - content = response.choices[0].message.content - interaction_data = json.loads(content) + # Wait for rate limit if needed + while not self.request_pool.can_make_request(): + time.sleep(1) + self.request_pool.add_request() + + response = self.client.chat.completions.create( + model=self.model, + messages=messages, + temperature=self.temperature, + response_format={"type": "json_object"} + ) + interaction_data = json.loads(response.choices[0].message.content) return WebInteraction( - action=interaction_data.get('action', task.get('interaction', 'click')).lower(), - selector_type=interaction_data.get('selector_type', task['target_element']['type']).lower(), - selector_value=interaction_data.get('selector_value', task['target_element']['value']), - input_text=interaction_data.get('input_text', task.get('input_text')), - description=task.get('task') + action=interaction_data.get('action', 'click'), + selector_type=interaction_data.get('selector_type', 'css'), + selector_value=interaction_data.get('selector_value'), + input_text=interaction_data.get('input_text'), + description=task['task'] ) except Exception as e: - print(f"Error parsing GPT-4 response: {str(e)}") - return self._create_fallback_interaction(task) - + print(f"Error in GPT-4 task parsing: {str(e)}") + return None + def _create_fallback_interaction(self, task: Dict[str, Any]) -> Optional[WebInteraction]: """Create a fallback interaction when API calls or parsing fails.""" # Don't use ground truth, just skip the task return None def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteraction]: - """Use GPT-4 to understand and handle errors with enhanced error analysis.""" - prompt = f"""Task: {task['task']} -Original Error: {error} -Previous Interaction: {json.dumps(task.get('previous_interaction', {}), indent=2)} - -Analyze the error and suggest a solution considering: -1. Is this a timing/loading issue? -2. Is the selector still valid? -3. Is the element interactive? -4. For hover: is the element hoverable? -5. Are there any prerequisite steps missing? + """Use GPT-4 to understand and handle errors.""" + error_prompt = f"""Task: {task['task']} +Error: {error} -Generate a modified interaction as a JSON object or respond with "GIVE UP" if unrecoverable.""" +Based on the task and error message, suggest a new web interaction that might work better. 
+Respond with a JSON object following the same schema as before.""" - messages = [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt} - ] - - response, api_error = self._call_api(messages) - if api_error or not response: - return self.parse_task(task) - - content = response.choices[0].message.content - if content.strip() == "GIVE UP": - return None - try: - interaction_data = json.loads(content) + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": error_prompt} + ], + temperature=self.temperature, + response_format={"type": "json_object"} + ) + + interaction_data = json.loads(response.choices[0].message.content) return WebInteraction( - action=interaction_data['action'], - selector_type=interaction_data['selector_type'], - selector_value=interaction_data['selector_value'], + action=interaction_data.get('action', 'click'), + selector_type=interaction_data.get('selector_type', 'css'), + selector_value=interaction_data.get('selector_value'), input_text=interaction_data.get('input_text'), - description=f"Error recovery: {task['task']}" + description=task['task'] ) except Exception as e: - print(f"Error in error handling: {str(e)}") - return self.parse_task(task) - + print(f"Error in GPT-4 error handling: {str(e)}") + return None + def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: """Enhanced validation using GPT-4 with detailed success criteria.""" if result.error: @@ -254,104 +287,3 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: if failure_reason: print(f"Validation failed: {failure_reason}") return False - - def evaluate_image_similarity(self, actual_img: str, expected_img: str) -> Dict[str, Any]: - """ - Evaluate similarity between actual and expected screenshots - - Args: - actual_img: Path to actual screenshot - expected_img: Path to expected (ground truth) screenshot - - Returns: - Dict containing similarity score and explanation - """ - try: - # Load images - with open(actual_img, "rb") as actual, open(expected_img, "rb") as expected: - response = self.client.chat.completions.create( - model="gpt-4o", - messages=[ - { - "role": "system", - "content": "You are an expert at comparing web page screenshots to determine if the same interaction was performed." - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Compare these two screenshots and determine if they show the same web interaction was performed. Focus on the relevant UI changes, not minor visual differences." 
- }, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{base64.b64encode(actual.read()).decode()}"} - }, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{base64.b64encode(expected.read()).decode()}"} - } - ] - } - ], - max_tokens=300 - ) - - return { - "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, - "explanation": response.choices[0].message.content - } - - except Exception as e: - logging.error(f"Error evaluating image similarity: {str(e)}") - return { - "score": 0.0, - "explanation": f"Error evaluating images: {str(e)}" - } - - def evaluate_html_similarity(self, actual_html: str, expected_html: str) -> Dict[str, Any]: - """ - Evaluate similarity between actual and expected HTML - - Args: - actual_html: Actual HTML string - expected_html: Expected HTML string - - Returns: - Dict containing similarity score and explanation - """ - try: - response = self.client.chat.completions.create( - model="gpt-4", - messages=[ - { - "role": "system", - "content": "You are an expert at comparing HTML elements to determine if they refer to the same interactive element." - }, - { - "role": "user", - "content": f"""Compare these two HTML elements and determine if they refer to the same interactive element: - - Actual HTML: - {actual_html} - - Expected HTML: - {expected_html} - - Focus on key attributes like id, class, role, and text content. Ignore minor differences in formatting or dynamic attributes.""" - } - ], - max_tokens=300 - ) - - return { - "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, - "explanation": response.choices[0].message.content - } - - except Exception as e: - logging.error(f"Error evaluating HTML similarity: {str(e)}") - return { - "score": 0.0, - "explanation": f"Error comparing HTML: {str(e)}" - } diff --git a/parallel_runner.py b/parallel_runner.py index a271ba3..101fffd 100644 --- a/parallel_runner.py +++ b/parallel_runner.py @@ -31,7 +31,7 @@ def __init__(self, model, max_workers: int = 4, output_dir: Path = None, - wait_time: float = 2.0, + wait_time: float = 3.0, page_load_timeout: int = 300, # 5 minutes element_timeout: int = 300): # 5 minutes """ @@ -202,38 +202,35 @@ def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Run tasks in parallel using ThreadPoolExecutor""" results = [] - - # Process tasks in smaller batches to avoid overwhelming the system - batch_size = self.max_workers # Process at most 5 tasks at a time total_tasks = len(tasks) logging.info(f"Starting parallel execution of {total_tasks} tasks with {self.max_workers} workers") - for i in range(0, total_tasks, batch_size): - batch = tasks[i:i + batch_size] - batch_num = i // batch_size + 1 - total_batches = (total_tasks + batch_size - 1) // batch_size - logging.info(f"Processing batch {batch_num}/{total_batches} ({len(batch)} tasks)") + # Use a single ThreadPoolExecutor for all tasks + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit all tasks at once + future_to_task = { + executor.submit(self.execute_task, task): task + for task in tasks + } - with ThreadPoolExecutor(max_workers=batch_size) as executor: - future_to_task = { - executor.submit(self.execute_task, task): task - for task in batch - } - - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - results.append(result) - logging.info(f"Task {task.get('id', 'unknown')} completed with success={result['success']}") - except Exception 
as e: - error_msg = f"Error processing task {task.get('id', 'unknown')}: {str(e)}" - logging.error(error_msg, exc_info=True) - results.append({ - "task_id": task.get("id", "unknown"), - "success": False, - "error": str(e) - }) + completed = 0 + # Process results as they complete + for future in as_completed(future_to_task): + task = future_to_task[future] + try: + result = future.result() + results.append(result) + completed += 1 + logging.info(f"Task {task.get('id', 'unknown')} completed with success={result['success']} ({completed}/{total_tasks})") + except Exception as e: + error_msg = f"Error processing task {task.get('id', 'unknown')}: {str(e)}" + logging.error(error_msg, exc_info=True) + results.append({ + "task_id": task.get("id", "unknown"), + "success": False, + "error": str(e) + }) + completed += 1 logging.info(f"Completed all {total_tasks} tasks") return results @@ -326,7 +323,7 @@ def run_parallel_benchmark( output_dir: str, model, max_workers: int = 4, - wait_time: float = 2.0, + wait_time: float = 3.0, page_load_timeout: int = 300, # 5 minutes element_timeout: int = 300 # 5 minutes ) -> List[Dict[str, Any]]: diff --git a/serial_runner.py b/serial_runner.py index c8bf31f..6c88c70 100644 --- a/serial_runner.py +++ b/serial_runner.py @@ -23,14 +23,14 @@ def __init__(self, model, output_dir: Path = None, save_accessibility_tree: bool = True, - wait_time: float = 2.0): + wait_time: float = 3.0): """ Initialize SerialTaskRunner Args: model: Language model to use for task parsing output_dir: Directory for results and screenshots - save_accessibility_tree: Whether to save accessibility trees + save_accessibility_tree: Whether to save accessibility tree wait_time: Wait time between actions in seconds """ self.model = model @@ -195,7 +195,7 @@ def run_serial_benchmark( output_dir: str, model, save_accessibility_tree: bool = True, - wait_time: float = 2.0 + wait_time: float = 3.0 ) -> List[Dict[str, Any]]: """ Run benchmark tasks serially