Commit 767836d

dhruvahuja19 committed Dec 20, 2024
1 parent 117a792 commit 767836d
Showing 3 changed files with 133 additions and 99 deletions.
52 changes: 52 additions & 0 deletions README.md
@@ -128,6 +128,58 @@ The repository includes two task sets:
- `data/test_tasks.jsonl`: Full test set with 100+ tasks
- `data/test_tasks_10.jsonl`: Smaller set of 10 tasks for quick testing

## Detailed Setup Instructions
- **Environment Configuration**: Copy `.env.example` to `.env` and fill in your API keys.
- **Dependencies**: Install dependencies using `pip install -r requirements.txt`.
- **Virtual Environment**: (Optional) Set up a virtual environment using `venv`; the example sequence below walks through all three steps.
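
In practice, the setup might look like this (a sketch assuming a Unix-like shell; the key names inside `.env.example` are repository-specific):

```bash
# Optional: create and activate a virtual environment
python -m venv .venv
source .venv/bin/activate

# Install dependencies
pip install -r requirements.txt

# Copy the environment template, then edit .env to add your API keys
cp .env.example .env
```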

## Running Benchmarks
- **Main Script**: Use `run.py` to execute benchmarks. Example:
```bash
python run.py --tasks data/test_tasks.jsonl --output results --model gpt4
```
- **Parallel and Serial Execution**: Use `parallel_runner.py` or `serial_runner.py` to force concurrent or one-at-a-time execution; see the sketch below.
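
A minimal sketch (the flags mirror the `run.py` example above and are assumptions; check each script's `--help` for the actual interface):

```bash
# Run tasks concurrently (hypothetical flags)
python parallel_runner.py --tasks data/test_tasks.jsonl --output results --model gpt4

# Run tasks one at a time, e.g. for strict rate limits (hypothetical flags)
python serial_runner.py --tasks data/test_tasks_10.jsonl --output results --model gpt4
```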

## Adding New Models
- **Model Class**: Create a new class in `models/` inheriting from `BaseModel` (see the sketch after this list).
- **Integration**: Implement required methods and integrate with `run.py`.
- **Testing**: Validate the new model with existing task sets.
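
A new model wrapper might start like this. This is a sketch based on the `GeminiModel` constructor in this commit; the `models.base` import path and the `run_task` method name are assumptions about `BaseModel`'s actual interface:

```python
from typing import Any, Dict

from models.base import BaseModel  # assumed module path


class MyModel(BaseModel):
    """Hypothetical model wrapper for the DOM benchmark."""

    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
        # Mirror GeminiModel: pass a model name and config up to BaseModel
        super().__init__("my-model-v1", model_config or {})
        self.api_key = api_key

    def run_task(self, task: Dict[str, Any]):
        # Call the underlying model API and return the benchmark's result type.
        raise NotImplementedError
```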

## Interpreting Results
- **Results Directory**: Check the `results/` directory for output files and logs.
- **Evaluation**: Use `evaluate.py` to assess model performance (example below).
- **Logs**: Review logs for insights into model behavior and errors.
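
For example (hypothetical flags; run `python evaluate.py --help` for the real interface):

```bash
python evaluate.py --results results/ --tasks data/test_tasks.jsonl
```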

## Baseline Results
- **Reference Scores**: Baseline results are available in `results/baseline_results/`.
- **Comparison**: Use these scores to evaluate new models or configurations.

## Additional Resources
- **Scripts**: Explore the `scripts/` directory for additional utilities.
- **Examples**: Check the `examples/` directory for example usage and configurations.
- **Utilities**: Use `utils.py` and other scripts in `utils/` for common tasks.

## Documentation

### Using the Benchmark
- **Setup**: Ensure all dependencies are installed and API keys are configured in the `.env` file.
- **Running Tests**: Use the `benchmark` module to run tests on different models. Specify the model and task set.
- **Serial vs Parallel**: Use `--serial` for models with strict rate limits; an example follows.
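
For example, combining the `run.py` flags shown earlier with `--serial` (assuming `run.py` accepts the flag directly):

```bash
# Serial execution for models with strict rate limits
python run.py --tasks data/test_tasks_10.jsonl --output results --model gpt4 --serial
```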

### Adding New Agents
- **Model Integration**: Implement a new model class inheriting from `BaseModel`.
- **Configuration**: Configure API keys and model parameters in the new class.
- **Testing**: Add the new model to the benchmark script and test with existing task sets.

### Interpreting Results
- **Output Files**: Check the `results` directory for detailed logs and evaluation scores.
- **Error Handling**: Review logs for any errors or skipped tasks.
- **Baseline Comparison**: Compare results against baseline scores provided in the `baseline_results` directory.

### Baseline Results
- Baseline results for each model are available for comparison.
- Use these results to gauge the performance of new models or configurations.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.
141 changes: 68 additions & 73 deletions models/gemini.py
@@ -13,15 +13,13 @@
 class GeminiModel(BaseModel):
     """Gemini model implementation for the DOM benchmark."""
 
-    def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
-        """Initialize GeminiModel."""
-        super().__init__("gemini-pro", model_config or {})
-
-        # Configure Gemini API
+    def __init__(self, api_key: str, model_config: Dict[str, Any] = None):
+        super().__init__("gemini-1.5-pro", model_config or {})
         genai.configure(api_key=api_key)
         self.model = genai.GenerativeModel('gemini-1.5-pro')
         self.max_retries = 10
-        self.temperature = 0
+        self.temperature = model_config.get("temperature", 0)
+        self.max_tokens = 32000
         # Use GPT-4 tokenizer as an approximation since Gemini uses similar tokenization
         self.tokenizer = tiktoken.encoding_for_model("gpt-4")
         self.function_parser = FunctionParser()
@@ -65,92 +63,89 @@ def __init__(self, api_key: str = None, model_config: Dict[str, Any] = None):
</args>"""

def _clean_html(self, html: str) -> str:
"""Keep only relevant semantic HTML elements and attributes for content analysis."""
# Count tokens before cleaning
tokenizer = genai.GenerativeModel("gemini-pro").count_tokens
initial_tokens = tokenizer(html).total_tokens
print(f"[Gemini] Initial HTML context length: {initial_tokens} tokens")

# Use BeautifulSoup for robust HTML parsing
"""Remove all JavaScript and CSS from HTML to reduce size."""
# First use BeautifulSoup for robust HTML parsing
soup = BeautifulSoup(html, "html.parser")

# Define elements we want to keep
allowed_elements = {
# Text content elements
'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'li', 'a', 'table', 'tr', 'td', 'th',
'div', 'span', 'strong', 'em', 'code', 'pre',
'blockquote', 'article', 'section', 'main',

# Interactive elements
'button', 'input', 'select', 'option', 'textarea', 'form',
'label', 'fieldset', 'legend', 'datalist', 'output',

# Media elements that might be clickable
'img', 'svg', 'canvas', 'video', 'audio',
# Remove script tags and their contents
for script in soup.find_all('script'):
script.decompose()

# Remove style tags and their contents
for style in soup.find_all('style'):
style.decompose()

# Navigation elements
'nav', 'header', 'footer', 'menu', 'menuitem',
# Remove link tags for stylesheets
for link in soup.find_all('link', rel="stylesheet"):
link.decompose()

# Interactive containers
'dialog', 'details', 'summary'
}
# Remove all style attributes
for tag in soup.find_all():
if tag.has_attr('style'):
del tag['style']

# Get the cleaned HTML
cleaned_html = str(soup)

# Define attributes we want to keep
allowed_attributes = {
'a': ['href', 'title'],
'img': ['alt', 'src'],
'*': ['id', 'class'] # Allow these on any element
}
# Additional regex-based cleaning for things BeautifulSoup might miss
# Remove noscript tags and their contents
cleaned_html = re.sub(r'<noscript\b[^<]*(?:(?!<\/noscript>)<[^<]*)*<\/noscript>', '', cleaned_html)

# Function to clean a tag
def clean_tag(tag):
if tag.name not in allowed_elements:
tag.unwrap() # Keep content but remove the tag
return

# Remove all attributes except allowed ones
allowed_for_tag = allowed_attributes.get(tag.name, []) + allowed_attributes['*']
attrs = dict(tag.attrs) # Create a copy since we're modifying
for attr in attrs:
if attr not in allowed_for_tag:
del tag[attr]
# Remove template tags (often used by JS frameworks)
cleaned_html = re.sub(r'<template\b[^<]*(?:(?!<\/template>)<[^<]*)*<\/template>', '', cleaned_html)

# Clean all tags in the document
for tag in soup.find_all(True):
clean_tag(tag)

cleaned_html = str(soup)
final_tokens = tokenizer(cleaned_html).total_tokens
print(f"[Gemini] Final HTML context length: {final_tokens} tokens")
print(f"[Gemini] Reduced by: {initial_tokens - final_tokens} tokens ({((initial_tokens - final_tokens) / initial_tokens * 100):.1f}%)")
# Remove preloaded resources
cleaned_html = re.sub(r'<link[^>]*rel="preload"[^>]*>', '', cleaned_html)

# Remove meta tags with CSS/JS content
cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Style-Type"[^>]*>', '', cleaned_html)
cleaned_html = re.sub(r'<meta[^>]*http-equiv="Content-Script-Type"[^>]*>', '', cleaned_html)

# Remove inline event handlers
cleaned_html = re.sub(r'\son\w+="[^"]*"', '', cleaned_html)

# Remove javascript: URLs
cleaned_html = re.sub(r'href="javascript:[^"]*"', '', cleaned_html)

# Remove data attributes (often used for JS functionality)
cleaned_html = re.sub(r'\sdata-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)

# Remove framework-specific attributes
cleaned_html = re.sub(r'\s(?:ng|v|x)-[a-zA-Z0-9\-_]+="[^"]*"', '', cleaned_html)

# Remove old-style HTML styling attributes
attrs_to_remove = ['align', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
'color', 'face', 'height', 'hspace', 'marginheight', 'marginwidth',
'size', 'valign', 'vspace', 'width']
for attr in attrs_to_remove:
cleaned_html = re.sub(fr'\s{attr}="[^"]*"', '', cleaned_html)

return cleaned_html

def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]:
"""Helper method to call Gemini API with retry logic."""
try:
# Convert messages to Gemini format
gemini_messages = []
prompt = ""
for msg in messages:
if msg["role"] == "system":
# Prepend system message to user message since Gemini doesn't support system
continue
elif msg["role"] == "user":
gemini_messages.append(msg["content"])
elif msg["role"] == "assistant":
gemini_messages.append(msg["content"])

# Join all messages with newlines
prompt = "\n".join(gemini_messages)

# Make API call
role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: "
prompt += f"{role_prefix}{msg['content']}\n\n"

# Add explicit instruction for JSON output
prompt += "\nPlease respond with a valid JSON object following the specified format."

response = self.model.generate_content(
prompt,
generation_config=genai.types.GenerationConfig(
temperature=self.temperature
temperature=self.temperature,
max_output_tokens=self.max_tokens
)
)

# Ensure the response was generated successfully
if not response.parts:
raise Exception("Empty response from Gemini")

return response, False
except Exception as e:
if any(err in str(e).lower() for err in ["too_long", "length", "token limit"]):
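
The rewritten `_call_api` flattens the chat history into a single text prompt with role prefixes, since this code path sends Gemini one text blob rather than structured turns. A standalone sketch of just that loop, reproducing the committed logic:

```python
# Reproduces the role-prefix flattening from the new _call_api
messages = [
    {"role": "system", "content": "You are a DOM interaction agent."},
    {"role": "user", "content": "Click the login button."},
]

prompt = ""
for msg in messages:
    role_prefix = "System: " if msg["role"] == "system" else "User: " if msg["role"] == "user" else "Assistant: "
    prompt += f"{role_prefix}{msg['content']}\n\n"

print(prompt)
# System: You are a DOM interaction agent.
#
# User: Click the login button.
```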
@@ -308,4 +303,4 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool:
         failure_reason = validation_result.replace("NO", "").strip()
         if failure_reason:
             print(f"Validation failed: {failure_reason}")
-        return False
\ No newline at end of file
+        return False
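
To see the effect of the rewritten `_clean_html`, here is a standalone sketch of its BeautifulSoup pass (regex steps omitted; requires `beautifulsoup4`):

```python
from bs4 import BeautifulSoup

html = '<div style="color:red" onclick="go()"><script>alert(1)</script><p>Hello</p></div>'
soup = BeautifulSoup(html, "html.parser")

# Remove script tags and their contents
for script in soup.find_all('script'):
    script.decompose()

# Strip inline style attributes
for tag in soup.find_all():
    if tag.has_attr('style'):
        del tag['style']

print(str(soup))
# -> <div onclick="go()"><p>Hello</p></div>
# (the onclick handler is removed later by the regex pass)
```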
39 changes: 13 additions & 26 deletions models/gemini_function_parser.py
@@ -1,4 +1,5 @@
 import re
+import json
 from typing import Dict, Any, Optional, List, Tuple
 
 class FunctionParser:
@@ -40,30 +41,16 @@ def extract_function_calls(text: str) -> List[Dict[str, Any]]:

         return function_calls
 
-    @staticmethod
-    def extract_web_interaction(text: str) -> Optional[Dict[str, Any]]:
-        """
-        Extract web interaction details from Gemini's text output.
-        Expected format:
-        <interaction>
-        {
-            "action": "click|type|hover",
-            "selector_type": "css|xpath|id|class",
-            "selector_value": "string",
-            "input_text": "string",
-            "description": "string"
-        }
-        </interaction>
-        """
-        pattern = r'<interaction>\s*(\{[\s\S]*?\})\s*</interaction>'
-
-        match = re.search(pattern, text)
-        if not match:
-            return None
-
+    def extract_web_interaction(self, response_text: str) -> dict:
+        """Extract web interaction details from Gemini model response."""
         try:
-            interaction_str = match.group(1).strip()
-            return eval(interaction_str)  # Using eval since the dict might contain single quotes
-        except Exception as e:
-            print(f"Error parsing interaction: {str(e)}")
-            return None
+            # Attempt to parse the response as JSON
+            interaction_data = json.loads(response_text)
+            return interaction_data
+        except json.JSONDecodeError:
+            # Log an error if parsing fails
+            print("Failed to parse response as JSON")
+            return {}
+
+        # Additional parsing logic can be added here if needed
+        return {}
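
With this change, the parser expects the model's response to be a bare JSON object parsed with `json.loads`, rather than an `<interaction>`-wrapped block parsed with `eval`. A quick usage sketch of the new method (the import path is assumed from the file location):

```python
from models.gemini_function_parser import FunctionParser  # assumed import path

parser = FunctionParser()
interaction = parser.extract_web_interaction(
    '{"action": "click", "selector_type": "css", '
    '"selector_value": "#submit", "description": "Click the submit button"}'
)
print(interaction["action"])  # -> click
```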
