diff --git a/README.md b/README.md
index 066a92a..7b51cb0 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,37 @@ The repository includes two task sets:
 - `data/test_tasks.jsonl`: Full test set with 100+ tasks
 - `data/test_tasks_10.jsonl`: Smaller set of 10 tasks for quick testing
 
+## Detailed Setup Instructions
+- **Environment Configuration**: Copy `.env.example` to `.env` and fill in your API keys.
+- **Dependencies**: Install the Python dependencies with `pip install -r requirements.txt`.
+- **Virtual Environment**: (Optional) Create and activate a virtual environment with `venv` before installing dependencies.
+
+## Running Benchmarks
+- **Main Script**: Use `run.py` to execute benchmarks. Example:
+  ```bash
+  python run.py --tasks data/test_tasks.jsonl --output results --model gpt4
+  ```
+- **Parallel and Serial Execution**: Use `parallel_runner.py` or `serial_runner.py` to force a specific execution mode; use `--serial` for models with strict rate limits.
+
+## Adding New Models
+- **Model Class**: Create a new class in `models/` that inherits from `BaseModel`.
+- **Integration**: Implement the required methods and add the model to `run.py`.
+- **Testing**: Validate the new model against the existing task sets.
+
+## Interpreting Results
+- **Results Directory**: Check the `results/` directory for output files, detailed logs, and evaluation scores.
+- **Evaluation**: Use `evaluate.py` to score model performance.
+- **Logs**: Review the logs for insights into model behavior, including errors and skipped tasks.
+
+## Baseline Results
+- **Reference Scores**: Baseline results for each model are available in `results/baseline_results/`.
+- **Comparison**: Use these scores to gauge the performance of new models or configurations.
+
+## Additional Resources
+- **Scripts**: Explore the `scripts/` directory for additional utilities.
+- **Examples**: Check the `examples/` directory for example usage and configurations.
+- **Utilities**: Use `utils.py` and the other helpers in `utils/` for common tasks.
+
 ## Contributing
 
 Contributions are welcome! Please feel free to submit a Pull Request.
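To make the "Adding New Models" section above concrete, a minimal skeleton of such a class might look like the following. This is only a sketch: the `(name, config)` constructor call is mirrored from `GeminiModel` in the diff below, while the import path and the `run_task` method name are assumptions, since `BaseModel`'s full interface isn't visible in this change.

```python
# Hypothetical skeleton for a new model in models/my_model.py.
# NOTE: the import path and the run_task method name are assumptions;
# check BaseModel's actual interface in the repository.
from typing import Any, Dict

from models.base_model import BaseModel  # assumed module path


class MyModel(BaseModel):
    """Minimal example model for the DOM benchmark."""

    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
        # Mirror the (name, config) super().__init__ call used by GeminiModel.
        super().__init__("my-model-v1", model_config or {})
        self.api_key = api_key
        self.temperature = (model_config or {}).get("temperature", 0)

    def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        # Call the provider's API here and map its output onto the
        # benchmark's result format.
        raise NotImplementedError
```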
diff --git a/models/gemini.py b/models/gemini.py
index 5f2146e..175e7dd 100644
--- a/models/gemini.py
+++ b/models/gemini.py
@@ -13,15 +13,13 @@ class GeminiModel(BaseModel):
     """Gemini model implementation for the DOM benchmark."""
 
-    def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
-        """Initialize GeminiModel."""
-        super().__init__("gemini-pro", model_config or {})
-
-        # Configure Gemini API
+    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
+        super().__init__("gemini-1.5-pro", model_config or {})
         genai.configure(api_key=api_key)
         self.model = genai.GenerativeModel('gemini-1.5-pro')
         self.max_retries = 10
-        self.temperature = 0
+        self.temperature = (model_config or {}).get("temperature", 0)
+        self.max_tokens = 8192  # Gemini 1.5 Pro's output cap; larger values are rejected by the API
         # Use GPT-4 tokenizer as an approximation since Gemini uses similar tokenization
         self.tokenizer = tiktoken.encoding_for_model("gpt-4")
         self.function_parser = FunctionParser()
 
@@ -65,65 +63,62 @@ def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
         """
 
     def _clean_html(self, html: str) -> str:
-        """Keep only relevant semantic HTML elements and attributes for content analysis."""
-        # Count tokens before cleaning
-        tokenizer = genai.GenerativeModel("gemini-pro").count_tokens
-        initial_tokens = tokenizer(html).total_tokens
-        print(f"[Gemini] Initial HTML context length: {initial_tokens} tokens")
-
-        # Use BeautifulSoup for robust HTML parsing
+        """Remove all JavaScript and CSS from HTML to reduce size."""
+        # First use BeautifulSoup for robust HTML parsing
         soup = BeautifulSoup(html, "html.parser")
 
-        # Define elements we want to keep
-        allowed_elements = {
-            # Text content elements
-            'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-            'ul', 'ol', 'li', 'a', 'table', 'tr', 'td', 'th',
-            'div', 'span', 'strong', 'em', 'code', 'pre',
-            'blockquote', 'article', 'section', 'main',
-
-            # Interactive elements
-            'button', 'input', 'select', 'option', 'textarea', 'form',
-            'label', 'fieldset', 'legend', 'datalist', 'output',
-
-            # Media elements that might be clickable
-            'img', 'svg', 'canvas', 'video', 'audio',
+        # Remove script tags and their contents
+        for script in soup.find_all('script'):
+            script.decompose()
+
+        # Remove style tags and their contents
+        for style in soup.find_all('style'):
+            style.decompose()
 
-            # Navigation elements
-            'nav', 'header', 'footer', 'menu', 'menuitem',
+        # Remove link tags for stylesheets
+        for link in soup.find_all('link', rel="stylesheet"):
+            link.decompose()
 
-            # Interactive containers
-            'dialog', 'details', 'summary'
-        }
+        # Remove all style attributes
+        for tag in soup.find_all():
+            if tag.has_attr('style'):
+                del tag['style']
+
+        # Get the cleaned HTML
+        cleaned_html = str(soup)
 
-        # Define attributes we want to keep
-        allowed_attributes = {
-            'a': ['href', 'title'],
-            'img': ['alt', 'src'],
-            '*': ['id', 'class']  # Allow these on any element
-        }
+        # Additional regex-based cleaning for things BeautifulSoup might miss
+        # Remove noscript tags and their contents
+        cleaned_html = re.sub(r'<noscript\b[^<]*(?:(?!<\/noscript>)<[^<]*)*<\/noscript>', '', cleaned_html)
 
-        # Function to clean a tag
-        def clean_tag(tag):
-            if tag.name not in allowed_elements:
-                tag.unwrap()  # Keep content but remove the tag
-                return
-
-            # Remove all attributes except allowed ones
-            allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*']
-            attrs = dict(tag.attrs)  # Create a copy since we're modifying
-            for attr in attrs:
-                if attr not in allowed_for_tag:
-                    del tag[attr]
+        # Remove template tags (often used by JS frameworks)
+        cleaned_html = re.sub(r'<template\b[^<]*(?:(?!<\/template>)<[^<]*)*<\/template>', '', cleaned_html)
 
-        # Clean all tags in the document
-        for tag in soup.find_all(True):
-            clean_tag(tag)
-
-        cleaned_html = str(soup)
-        final_tokens = tokenizer(cleaned_html).total_tokens
-        print(f"[Gemini] Final HTML context length: {final_tokens} tokens")
-        print(f"[Gemini] Reduced by: {initial_tokens - final_tokens} tokens ({((initial_tokens - final_tokens) / initial_tokens * 100):.1f}%)")
+        # Remove preloaded resources
+        cleaned_html = re.sub(r'<link[^>]*rel="preload"[^>]*>', '', cleaned_html)
+
+        # Remove meta tags with CSS/JS content
+        cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Style-Type"[^>]*>', '', cleaned_html)
+        cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Script-Type"[^>]*>', '', cleaned_html)
+
+        # Remove inline event handlers
+        cleaned_html = re.sub(r'\son\w+="[^"]*"', '', cleaned_html)
+
+        # Remove javascript: URLs
+        cleaned_html = re.sub(r'href="javascript:[^"]*"', '', cleaned_html)
+
+        # Remove data attributes (often used for JS functionality)
+        cleaned_html = re.sub(r'\sdata-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)
+
+        # Remove framework-specific attributes
+        cleaned_html = re.sub(r'\s(?:ng|v|x)-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)
+
+        # Remove old-style HTML styling attributes
+        attrs_to_remove = ['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
+                           'color', 'face', 'height', 'hspace', 'marginheight', 'marginwidth',
+                           'size', 'valign', 'vspace', 'width']
+        for attr in attrs_to_remove:
+            cleaned_html = re.sub(fr'\s{attr}="[^"]*"', '', cleaned_html)
 
         return cleaned_html
 
@@ -131,26 +126,26 @@ def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]:
         """Helper method to call Gemini API with retry logic."""
         try:
             # Convert messages to Gemini format
-            gemini_messages = []
+            prompt = ""
             for msg in messages:
-                if msg["role"] == "system":
-                    # Prepend system message to user message since Gemini doesn't support system
-                    continue
-                elif msg["role"] == "user":
-                    gemini_messages.append(msg["content"])
-                elif msg["role"] == "assistant":
-                    gemini_messages.append(msg["content"])
-
-            # Join all messages with newlines
-            prompt = "\n".join(gemini_messages)
-
-            # Make API call
+                role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: "
+                prompt += f"{role_prefix}{msg['content']}\n\n"
+
+            # Add explicit instruction for JSON output
+            prompt += "\nPlease respond with a valid JSON object following the specified format."
+
             response = self.model.generate_content(
                 prompt,
                 generation_config=genai.types.GenerationConfig(
-                    temperature=self.temperature
+                    temperature=self.temperature,
+                    max_output_tokens=self.max_tokens
                 )
             )
+
+            # Ensure the response was generated successfully
+            if not response.parts:
+                raise Exception("Empty response from Gemini")
+
             return response, False
         except Exception as e:
             if any(err in str(e).lower() for err in ["too_long", "length", "token limit"]):
@@ -308,4 +303,4 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
             failure_reason = validation_result.replace("NO", "").strip()
             if failure_reason:
                 print(f"Validation failed: {failure_reason}")
-            return False
+            return False
\ No newline at end of file
diff --git a/models/gemini_function_parser.py b/models/gemini_function_parser.py
index 86f7860..811c131 100644
--- a/models/gemini_function_parser.py
+++ b/models/gemini_function_parser.py
@@ -1,4 +1,5 @@
 import re
+import json
 from typing import Dict, Any, Optional, List, Tuple
 
 class FunctionParser:
@@ -40,30 +41,13 @@ def extract_function_calls(text: str) -> List[Dict[str, Any]]:
 
         return function_calls
 
-    @staticmethod
-    def extract_web_interaction(text: str) -> Optional[Dict[str, Any]]:
-        """
-        Extract web interaction details from Gemini's text output.
-        Expected format:
-        <web_interaction>
-        {
-            "action": "click|type|hover",
-            "selector_type": "css|xpath|id|class",
-            "selector_value": "string",
-            "input_text": "string",
-            "description": "string"
-        }
-        </web_interaction>
-        """
-        pattern = r'<web_interaction>\s*(\{[\s\S]*?\})\s*</web_interaction>'
-
-        match = re.search(pattern, text)
-        if not match:
-            return None
-
+    def extract_web_interaction(self, response_text: str) -> dict:
+        """Extract web interaction details from a Gemini model response."""
         try:
-            interaction_str = match.group(1).strip()
-            return eval(interaction_str)  # Using eval since the dict might contain single quotes
-        except Exception as e:
-            print(f"Error parsing interaction: {str(e)}")
-            return None
+            # Attempt to parse the response as JSON
+            interaction_data = json.loads(response_text)
+            return interaction_data
+        except json.JSONDecodeError:
+            # Log an error if parsing fails
+            print("Failed to parse response as JSON")
+            return {}
\ No newline at end of file
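One caveat with the new `extract_web_interaction`: `json.loads` on the raw response text fails whenever the model wraps its JSON in markdown fences or surrounding prose, and the method then silently returns `{}`. A more tolerant variant, shown below as a sketch rather than part of this change, falls back to pulling the first brace-delimited block out of the text before giving up:

```python
import json
import re


def extract_web_interaction(response_text: str) -> dict:
    """Parse a web-interaction JSON object from a model response.

    Sketch only: tries strict JSON first, then falls back to the first
    {...} block in the text, which handles fenced or chatty responses.
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        pass

    # Fall back: extract the outermost brace-delimited block and retry.
    match = re.search(r'\{[\s\S]*\}', response_text)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass

    print("Failed to parse response as JSON")
    return {}
```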