Commit 767836d

dhruvahuja19 committed Dec 20, 2024
1 parent 117a792 commit 767836d
Showing 3 changed files with 133 additions and 99 deletions.
52 changes: 52 additions & 0 deletions README.md
@@ -128,6 +128,58 @@ The repository includes two task sets:
- `data/test_tasks.jsonl`: Full test set with 100+ tasks
- `data/test_tasks_10.jsonl`: Smaller set of 10 tasks for quick testing

## Detailed Setup Instructions
- **Environment Configuration**: Copy `.env.example` to `.env` and fill in your API keys.
- **Dependencies**: Install dependencies using `pip install -r requirements.txt`.
- **Virtual Environment**: (Optional) Set up a virtual environment using `venv`; the example sequence below walks through all three steps.
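
In practice, the setup might look like this (a sketch assuming a Unix-like shell; the key names inside `.env.example` are repository-specific):

```bash
# Optional: create and activate a virtual environment
python -m venv .venv
source .venv/bin/activate

# Install dependencies
pip install -r requirements.txt

# Copy the environment template, then edit .env to add your API keys
cp .env.example .env
```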

## Running Benchmarks
- **Main Script**: Use `run.py` to execute benchmarks. Example:
```bash
python run.py --tasks data/test_tasks.jsonl --output results --model gpt4
```
- **Parallel and Serial Execution**: Use `parallel_runner.py` or `serial_runner.py` to force concurrent or one-at-a-time execution; see the sketch below.
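
A minimal sketch (the flags mirror the `run.py` example above and are assumptions; check each script's `--help` for the actual interface):

```bash
# Run tasks concurrently (hypothetical flags)
python parallel_runner.py --tasks data/test_tasks.jsonl --output results --model gpt4

# Run tasks one at a time, e.g. for strict rate limits (hypothetical flags)
python serial_runner.py --tasks data/test_tasks_10.jsonl --output results --model gpt4
```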

## Adding New Models
- **Model Class**: Create a new class in `models/` inheriting from `BaseModel` (see the sketch after this list).
- **Integration**: Implement required methods and integrate with `run.py`.
- **Testing**: Validate the new model with existing task sets.
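
A new model wrapper might start like this. This is a sketch based on the `GeminiModel` constructor in this commit; the `models.base` import path and the `run_task` method name are assumptions about `BaseModel`'s actual interface:

```python
from typing import Any, Dict

from models.base import BaseModel  # assumed module path


class MyModel(BaseModel):
    """Hypothetical model wrapper for the DOM benchmark."""

    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
        # Mirror GeminiModel: pass a model name and config up to BaseModel
        super().__init__("my-model-v1", model_config or {})
        self.api_key = api_key

    def run_task(self, task: Dict[str, Any]):
        # Call the underlying model API and return the benchmark's result type.
        raise NotImplementedError
```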

## Interpreting Results
- **Results Directory**: Check the `results/` directory for output files and logs.
- **Evaluation**: Use `evaluate.py` to assess model performance (example below).
- **Logs**: Review logs for insights into model behavior and errors.
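
For example (hypothetical flags; run `python evaluate.py --help` for the real interface):

```bash
python evaluate.py --results results/ --tasks data/test_tasks.jsonl
```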

## Baseline Results
- **Reference Scores**: Baseline results are available in `results/baseline_results/`.
- **Comparison**: Use these scores to evaluate new models or configurations.

## Additional Resources
- **Scripts**: Explore the `scripts/` directory for additional utilities.
- **Examples**: Check the `examples/` directory for example usage and configurations.
- **Utilities**: Use `utils.py` and other scripts in `utils/` for common tasks.

## Documentation

### Using the Benchmark
- **Setup**: Ensure all dependencies are installed and API keys are configured in the `.env` file.
- **Running Tests**: Use the `benchmark` module to run tests on different models. Specify the model and task set.
- **Serial vs Parallel**: Use `--serial` for models with strict rate limits; an example follows.
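
For example, combining the `run.py` flags shown earlier with `--serial` (assuming `run.py` accepts the flag directly):

```bash
# Serial execution for models with strict rate limits
python run.py --tasks data/test_tasks_10.jsonl --output results --model gpt4 --serial
```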

### Adding New Agents
- **Model Integration**: Implement a new model class inheriting from `BaseModel`.
- **Configuration**: Configure API keys and model parameters in the new class.
- **Testing**: Add the new model to the benchmark script and test with existing task sets.

### Interpreting Results
- **Output Files**: Check the `results` directory for detailed logs and evaluation scores.
- **Error Handling**: Review logs for any errors or skipped tasks.
- **Baseline Comparison**: Compare results against baseline scores provided in the `baseline_results` directory.

### Baseline Results
- Baseline results for each model are available for comparison.
- Use these results to gauge the performance of new models or configurations.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.
141 changes: 68 additions & 73 deletions models/gemini.py
@@ -13,15 +13,13 @@
 class GeminiModel(BaseModel):
     """Gemini model implementation for the DOM benchmark."""
 
-    def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
-        """Initialize GeminiModel."""
-        super().__init__("gemini-pro", model_config or {})
-
-        # Configure Gemini API
+    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
+        super().__init__("gemini-1.5-pro", model_config or {})
         genai.configure(api_key=api_key)
         self.model = genai.GenerativeModel('gemini-1.5-pro')
         self.max_retries = 10
-        self.temperature = 0
+        self.temperature = model_config.get("temperature", 0)
+        self.max_tokens = 32000
         # Use GPT-4 tokenizer as an approximation since Gemini uses similar tokenization
         self.tokenizer = tiktoken.encoding_for_model("gpt-4")
         self.function_parser = FunctionParser()
@@ -65,92 +63,89 @@ def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
</args>"""

def _clean_html(self, html: str) -> str:
"""Keep only relevant semantic HTML elements and attributes for content analysis."""
# Count tokens before cleaning
tokenizer = genai.GenerativeModel("gemini-pro").count_tokens
initial_tokens = tokenizer(html).total_tokens
print(f"[Gemini] Initial HTML context length: {initial_tokens} tokens")

# Use BeautifulSoup for robust HTML parsing
"""Remove all JavaScript and CSS from HTML to reduce size."""
# First use BeautifulSoup for robust HTML parsing
soup = BeautifulSoup(html, "html.parser")

# Define elements we want to keep
allowed_elements = {
# Text content elements
'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'li', 'a', 'table', 'tr', 'td', 'th',
'div', 'span', 'strong', 'em', 'code', 'pre',
'blockquote', 'article', 'section', 'main',

# Interactive elements
'button', 'input', 'select', 'option', 'textarea', 'form',
'label', 'fieldset', 'legend', 'datalist', 'output',

# Media elements that might be clickable
'img', 'svg', 'canvas', 'video', 'audio',
# Remove script tags and their contents
for script in soup.find_all('script'):
script.decompose()

# Remove style tags and their contents
for style in soup.find_all('style'):
style.decompose()

# Navigation elements
'nav', 'header', 'footer', 'menu', 'menuitem',
# Remove link tags for stylesheets
for link in soup.find_all('link', rel="stylesheet"):
link.decompose()

# Interactive containers
'dialog', 'details', 'summary'
}
# Remove all style attributes
for tag in soup.find_all():
if tag.has_attr('style'):
del tag['style']

# Get the cleaned HTML
cleaned_html = str(soup)

# Define attributes we want to keep
allowed_attributes = {
'a': ['href', 'title'],
'img': ['alt', 'src'],
'*': ['id', 'class'] # Allow these on any element
}
# Additional regex-based cleaning for things BeautifulSoup might miss
# Remove noscript tags and their contents
cleaned_html = re.sub(r'<noscript\b[^<]*(?:(?!<\/noscript>)<[^<]*)*<\/noscript>', '', cleaned_html)

# Function to clean a tag
def clean_tag(tag):
if tag.name not in allowed_elements:
tag.unwrap() # Keep content but remove the tag
return

# Remove all attributes except allowed ones
allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*']
attrs = dict(tag.attrs) # Create a copy since we're modifying
for attr in attrs:
if attr not in allowed_for_tag:
del tag[attr]
# Remove template tags (often used by JS frameworks)
cleaned_html = re.sub(r'<template\b[^<]*(?:(?!<\/template>)<[^<]*)*<\/template>', '', cleaned_html)

# Clean all tags in the document
for tag in soup.find_all(True):
clean_tag(tag)

cleaned_html = str(soup)
final_tokens = tokenizer(cleaned_html).total_tokens
print(f"[Gemini] Final HTML context length: {final_tokens} tokens")
print(f"[Gemini] Reduced by: {initial_tokens - final_tokens} tokens ({((initial_tokens - final_tokens) / initial_tokens * 100):.1f}%)")
# Remove preloaded resources
cleaned_html = re.sub(r'<link[^>]*rel="preload"[^>]*>', '', cleaned_html)

# Remove meta tags with CSS/JS content
cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Style-Type"[^>]*>', '', cleaned_html)
cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Script-Type"[^>]*>', '', cleaned_html)

# Remove inline event handlers
cleaned_html = re.sub(r'\son\w+="[^"]*"', '', cleaned_html)

# Remove javascript: URLs
cleaned_html = re.sub(r'href="javascript:[^"]*"', '', cleaned_html)

# Remove data attributes (often used for JS functionality)
cleaned_html = re.sub(r'\sdata-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)

# Remove framework-specific attributes
cleaned_html = re.sub(r'\s(?:ng|v|x)-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)

# Remove old-style HTML styling attributes
attrs_to_remove = ['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
'color', 'face', 'height', 'hspace', 'marginheight', 'marginwidth',
'size', 'valign', 'vspace', 'width']
for attr in attrs_to_remove:
cleaned_html = re.sub(fr'\s{attr}="[^"]*"', '', cleaned_html)

return cleaned_html

def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]:
"""Helper method to call Gemini API with retry logic."""
try:
# Convert messages to Gemini format
gemini_messages = []
prompt = ""
for msg in messages:
if msg["role"] == "system":
# Prepend system message to user message since Gemini doesn't support system
continue
elif msg["role"] == "user":
gemini_messages.append(msg["content"])
elif msg["role"] == "assistant":
gemini_messages.append(msg["content"])

# Join all messages with newlines
prompt = "\n".join(gemini_messages)

# Make API call
role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: "
prompt += f"{role_prefix}{msg['content']}\n\n"

# Add explicit instruction for JSON output
prompt += "\nPlease respond with a valid JSON object following the specified format."

response = self.model.generate_content(
prompt,
generation_config=genai.types.GenerationConfig(
temperature=self.temperature
temperature=self.temperature,
max_output_tokens=self.max_tokens
)
)

# Ensure the response was generated successfully
if not response.parts:
raise Exception("Empty response from Gemini")

return response, False
except Exception as e:
if any(err in str(e).lower() for err in ["too_long", "length", "token limit"]):
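
The rewritten `_call_api` flattens the chat history into a single text prompt with role prefixes, since this code path sends Gemini one text blob rather than structured turns. A standalone sketch of just that loop, reproducing the committed logic:

```python
# Reproduces the role-prefix flattening from the new _call_api
messages = [
    {"role": "system", "content": "You are a DOM interaction agent."},
    {"role": "user", "content": "Click the login button."},
]

prompt = ""
for msg in messages:
    role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: "
    prompt += f"{role_prefix}{msg['content']}\n\n"

print(prompt)
# System: You are a DOM interaction agent.
#
# User: Click the login button.
```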
@@ -308,4 +303,4 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
         failure_reason = validation_result.replace("NO", "").strip()
         if failure_reason:
             print(f"Validation failed: {failure_reason}")
-        return False
\ No newline at end of file
+        return False
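
To see the effect of the rewritten `_clean_html`, here is a standalone sketch of its BeautifulSoup pass (regex steps omitted; requires `beautifulsoup4`):

```python
from bs4 import BeautifulSoup

html = '<div style="color:red" onclick="go()"><script>alert(1)</script><p>Hello</p></div>'
soup = BeautifulSoup(html, "html.parser")

# Remove script tags and their contents
for script in soup.find_all('script'):
    script.decompose()

# Strip inline style attributes
for tag in soup.find_all():
    if tag.has_attr('style'):
        del tag['style']

print(str(soup))
# -> <div onclick="go()"><p>Hello</p></div>
# (the onclick handler is removed later by the regex pass)
```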
39 changes: 13 additions & 26 deletions models/gemini_function_parser.py
@@ -1,4 +1,5 @@
 import re
+import json
 from typing import Dict, Any, Optional, List, Tuple
 
 class FunctionParser:
@@ -40,30 +41,16 @@ def extract_function_calls(text: str) -> List[Dict[str, Any]]:

         return function_calls
 
-    @staticmethod
-    def extract_web_interaction(text: str) -> Optional[Dict[str, Any]]:
-        """
-        Extract web interaction details from Gemini's text output.
-        Expected format:
-        <interaction>
-        {
-            "action": "click|type|hover",
-            "selector_type": "css|xpath|id|class",
-            "selector_value": "string",
-            "input_text": "string",
-            "description": "string"
-        }
-        </interaction>
-        """
-        pattern = r'<interaction>\s*(\{[\s\S]*?\})\s*</interaction>'
-
-        match = re.search(pattern, text)
-        if not match:
-            return None
-
+    def extract_web_interaction(self, response_text: str) -> dict:
+        """Extract web interaction details from Gemini model response."""
         try:
-            interaction_str = match.group(1).strip()
-            return eval(interaction_str)  # Using eval since the dict might contain single quotes
-        except Exception as e:
-            print(f"Error parsing interaction: {str(e)}")
-            return None
+            # Attempt to parse the response as JSON
+            interaction_data = json.loads(response_text)
+            return interaction_data
+        except json.JSONDecodeError:
+            # Log an error if parsing fails
+            print("Failed to parse response as JSON")
+            return {}
+
+        # Additional parsing logic can be added here if needed
+        return {}
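
With this change, the parser expects the model's response to be a bare JSON object parsed with `json.loads`, rather than an `<interaction>`-wrapped block parsed with `eval`. A quick usage sketch of the new method (the import path is assumed from the file location):

```python
from models.gemini_function_parser import FunctionParser  # assumed import path

parser = FunctionParser()
interaction = parser.extract_web_interaction(
    '{"action": "click", "selector_type": "css", '
    '"selector_value": "#submit", "description": "Click the submit button"}'
)
print(interaction["action"])  # -> click
```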
