From 23d4aa55572dc3ea5d53c25828bcf7bd6f49738b Mon Sep 17 00:00:00 2001 From: Dhruv Ahuja Date: Sun, 15 Dec 2024 18:27:36 -0800 Subject: [PATCH] Initial Setup --- .env.template | 15 +++ .gitignore | 44 ++++++++ LICENSE | 21 ++++ Makefile | 49 +++++++++ README.md | 140 ++++++++++++++++++++++++++ data/dom_tasks.jsonl | 2 + evaluation/README.md | 61 ++++++++++++ evaluation/auto_eval.py | 205 ++++++++++++++++++++++++++++++++++++++ image_match.py | 67 +++++++++++++ prompts.py | 50 ++++++++++ pyproject.toml | 63 ++++++++++++ requirements.txt | 8 ++ run.py | 165 ++++++++++++++++++++++++++++++ setup.cfg | 25 +++++ utils.py | 123 +++++++++++++++++++++++ utils/screenshot_utils.py | 113 +++++++++++++++++++++ 16 files changed, 1151 insertions(+) create mode 100644 .env.template create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 data/dom_tasks.jsonl create mode 100644 evaluation/README.md create mode 100644 evaluation/auto_eval.py create mode 100644 image_match.py create mode 100644 prompts.py create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 run.py create mode 100644 setup.cfg create mode 100644 utils.py create mode 100644 utils/screenshot_utils.py diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..8e52579 --- /dev/null +++ b/.env.template @@ -0,0 +1,15 @@ +# OpenAI API Key for GPT-4V evaluation +OPENAI_API_KEY=your_openai_api_key_here + +# Chrome WebDriver Settings +CHROME_BINARY_PATH=/path/to/chrome/binary # Optional +CHROME_DRIVER_PATH=/path/to/chromedriver # Optional + +# Benchmark Settings +HEADLESS=true # Run browser in headless mode +FORCE_DEVICE_SCALE=true # Force consistent device scaling +IMAGE_MATCH_THRESHOLD=0.95 # Threshold for image similarity matching + +# Output Settings +SAVE_ACCESSIBILITY_TREE=true # Save accessibility tree for each task +LOG_LEVEL=INFO # Logging level (DEBUG, INFO, WARNING, ERROR) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c0161c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,44 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +env/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Benchmark specific +results/ +data/ground_truth/*.png +*.log + +# Environment variables +.env + +# OS specific +.DS_Store +Thumbs.db diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d77e04a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Dhruv Ahuja + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4f14495 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +.PHONY: install test lint format clean run evaluate + +# Environment setup +install: + pip install -e . + pip install -r requirements.txt + +# Testing +test: + pytest + +# Code quality +lint: + flake8 . + mypy . + black . --check + isort . --check + +format: + black . + isort . + +# Cleaning +clean: + rm -rf build/ + rm -rf dist/ + rm -rf *.egg-info + find . -type d -name __pycache__ -exec rm -r {} + + find . -type f -name "*.pyc" -delete + find . -type f -name "*.pyo" -delete + find . -type f -name "*.pyd" -delete + find . -type f -name ".coverage" -delete + find . -type d -name "*.egg-info" -exec rm -r {} + + find . -type d -name "*.egg" -exec rm -r {} + + +# Benchmark commands +run: + python run.py \ + --tasks data/dom_tasks.jsonl \ + --output results/run_001 \ + --headless \ + --save-accessibility-tree + +evaluate: + python evaluation/auto_eval.py \ + --tasks data/dom_tasks.jsonl \ + --results results/run_001 \ + --ground-truth data/ground_truth \ + --output results/run_001/evaluation.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..b56ff30 --- /dev/null +++ b/README.md @@ -0,0 +1,140 @@ +# DOM and DOMer-2 + +A benchmark for evaluating language models' ability to execute web element interactions. + +## Overview + +DOM and DOMer-2 focuses on testing a model's ability to interact with web elements (clicking buttons, typing text, etc.) without requiring complex planning or reasoning. The benchmark provides: + +1. Simple, single-action tasks +2. Real websites with diverse DOM structures +3. Ground truth screenshots for validation +4. GPT-4V based evaluation + +## Directory Structure + +``` +DOMe-and-DOMer-2/ +├── data/ +│ ├── dom_tasks.jsonl # Task definitions +│ └── ground_truth/ # Ground truth screenshots +│ ├── amazon_search_1_gt.png +│ └── ... +├── evaluation/ +│ ├── auto_eval.py # GPT-4V evaluation script +│ └── README.md # Evaluation documentation +├── results/ # Results for each run +│ └── run_001/ +│ ├── before_*.png # Screenshots before interaction +│ ├── after_*.png # Screenshots after interaction +│ ├── accessibility_*.json # Accessibility trees +│ ├── results.json # Raw results +│ ├── evaluation.json # GPT-4V evaluations +│ └── benchmark.log # Detailed logs +├── prompts.py # LLM system prompts +├── run.py # Main benchmark runner +├── utils.py # Utility functions +└── requirements.txt # Dependencies + +## Task Format + +Tasks are defined in `data/dom_tasks.jsonl`: + +```json +{ + "web_name": "Amazon", + "id": "amazon_search_1", + "task": "Click the search button", + "web": "https://www.amazon.com", + "element_type": "button", + "interaction": "click", + "target_element": { + "type": "id", + "value": "nav-search-submit-button" + }, + "ground_truth": { + "screenshot": "amazon_search_1_gt.png", + "description": "The search button has been clicked, showing search results" + } +} +``` + +## Ground Truth + +Ground truth is provided in two forms: +1. **Screenshots**: Visual state after successful interaction +2. 
**Descriptions**: Text description of expected changes + +Located in `data/ground_truth/`, each task has: +- `[task_id]_gt.png`: Screenshot of successful interaction +- Description in task JSON explaining expected changes + +## Running the Benchmark + +1. **Run Tests**: +```bash +python run.py \ + --tasks data/dom_tasks.jsonl \ + --output results/run_001 \ + --headless \ + --save-accessibility-tree +``` + +2. **Evaluate Results**: +```bash +python evaluation/auto_eval.py \ + --tasks data/dom_tasks.jsonl \ + --results results/run_001 \ + --ground-truth data/ground_truth \ + --output results/run_001/evaluation.json \ + --openai-key YOUR_API_KEY +``` + +## Evaluation Process + +1. **Technical Validation**: + - Element found and interacted with + - No errors during interaction + - Accessibility tree verification + +2. **Visual Validation**: + - Compare after screenshot with ground truth + - Verify expected visual changes + - Check for unintended side effects + +3. **GPT-4V Analysis**: + - Compare before/after/ground-truth screenshots + - Verify interaction success + - Check visual state matches expectations + +## Output Format + +```json +{ + "total_tasks": 10, + "successful_tasks": 8, + "evaluations": [ + { + "task_id": "amazon_search_1", + "success": true, + "evaluation": "Detailed evaluation text...", + "timestamp": 1234567890 + } + ] +} +``` + +## Requirements + +- Python 3.8+ +- Chrome/Chromium browser +- OpenAI API key (for evaluation) +- Required packages in `requirements.txt` + +## Contributing + +[Contributing guidelines will be added] + +## License + +[License information will be added] diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl new file mode 100644 index 0000000..e21902b --- /dev/null +++ b/data/dom_tasks.jsonl @@ -0,0 +1,2 @@ +{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "accessibility_changes": ["Search box aria-value updates to 'hello'", "Search suggestions list may become visible"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}} +{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "accessibility_changes": ["Search results region becomes visible", "Page title updates to include 'hello'", "Search results are announced to screen readers"], "success_criteria": ["Search button responds to click", "Results page loads with 'hello' definition", "No error messages are displayed"]}} diff --git a/evaluation/README.md b/evaluation/README.md new file mode 100644 index 0000000..b379c3d --- /dev/null +++ b/evaluation/README.md @@ -0,0 +1,61 @@ +# DOM and 
DOMer-2 Evaluation
+
+This directory contains the evaluation tools for the DOM and DOMer-2 benchmark.
+
+## Overview
+
+The evaluation uses GPT-4V to assess web interactions by analyzing:
+1. Before/After screenshots of the webpage
+2. Accessibility tree information
+3. Task descriptions and expected outcomes
+
+## Usage
+
+```bash
+python auto_eval.py \
+    --tasks ../data/dom_tasks.jsonl \
+    --results ../results/run_001 \
+    --ground-truth ../data/ground_truth \
+    --output ../results/run_001/evaluation.json \
+    --openai-key YOUR_API_KEY
+```
+
+## Evaluation Process
+
+1. **Screenshot Analysis**
+   - Compare before/after states
+   - Verify visual changes match expected interaction
+   - Check element visibility and state changes
+
+2. **Accessibility Tree Verification**
+   - Validate correct element was targeted
+   - Check element attributes and relationships
+   - Verify element state changes
+
+3. **Success Criteria**
+   - Correct element identified and interacted with
+   - Expected visual changes occurred
+   - No unintended side effects
+
+## Output Format
+
+```json
+{
+  "total_tasks": 10,
+  "successful_tasks": 8,
+  "evaluations": [
+    {
+      "task_id": "task_001",
+      "success": true,
+      "evaluation": "Detailed evaluation text...",
+      "timestamp": 1234567890
+    },
+    ...
+  ]
+}
+```
+
+## Requirements
+
+- OpenAI API key with GPT-4V access
+- Python 3.8+
+- Required packages in `requirements.txt`
diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py
new file mode 100644
index 0000000..41bed6e
--- /dev/null
+++ b/evaluation/auto_eval.py
@@ -0,0 +1,205 @@
+import argparse
+import os
+import json
+import time
+import base64
+from pathlib import Path
+from typing import List, Dict, Any
+
+from openai import OpenAI
+from dotenv import load_dotenv
+
+SYSTEM_PROMPT = """As an evaluator for the DOM and DOMer-2 benchmark, you will assess web element interactions based on:
+
+1. Task Description: A specific web interaction task (e.g., "Click the search button", "Type text in input field")
+
+2. Visual Validation:
+   - Before: Initial webpage state
+   - After: Actual result after interaction
+   - Ground Truth: Expected result for successful interaction
+   - Expected Visual Changes: List of specific visual changes to verify
+
+3. Accessibility Validation:
+   - Accessibility Tree: JSON representation of webpage's accessibility state
+   - Expected Accessibility Changes: List of specific accessibility changes to verify
+
+4. Success Criteria:
+   - Specific conditions that must be met for success
+   - Visual state matches ground truth
+   - Accessibility state reflects expected changes
+
+Your evaluation should:
+1. Compare before/after/ground-truth screenshots
+2. Verify all listed visual changes occurred
+3. Validate accessibility tree changes
+4. Check all success criteria are met
+
+Provide your evaluation as:
+1. 'SUCCESS' or 'NOT SUCCESS'
+2.
Detailed explanation of: + - Visual changes observed/missing + - Accessibility changes verified/missing + - Success criteria met/failed""" + +def encode_image(image_path: str) -> str: + """Encode image as base64 string""" + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode('utf-8') + +def evaluate_task( + task: Dict[str, Any], + result: Dict[str, Any], + output_dir: Path, + ground_truth_dir: Path, + openai_client: OpenAI +) -> Dict[str, Any]: + """Evaluate a single task using GPT-4V""" + + # Get screenshots + before_img = encode_image(str(output_dir / f"before_{task['id']}.png")) + after_img = encode_image(str(output_dir / f"after_{task['id']}.png")) + ground_truth_img = encode_image(str(ground_truth_dir / task['ground_truth']['screenshot'])) + + # Get accessibility tree if available + tree_path = output_dir / f"accessibility_tree_{task['id']}.json" + accessibility_tree = None + if tree_path.exists(): + with open(tree_path) as f: + accessibility_tree = json.load(f) + + # Format prompt with enhanced ground truth information + messages = [ + { + "role": "system", + "content": SYSTEM_PROMPT + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"""Task: {task['task']} +Website: {task['web_name']} +Interaction: {task['interaction']} +Element Type: {task['element_type']} + +Ground Truth Information: +1. Description: {task['ground_truth']['description']} +2. Expected Visual Changes: +{chr(10).join(f' - {change}' for change in task['ground_truth'].get('visual_changes', []))} +3. Expected Accessibility Changes: +{chr(10).join(f' - {change}' for change in task['ground_truth'].get('accessibility_changes', []))} +4. Success Criteria: +{chr(10).join(f' - {criterion}' for criterion in task['ground_truth'].get('success_criteria', []))} + +Accessibility Tree: +{json.dumps(accessibility_tree, indent=2) if accessibility_tree else 'Not available'} + +Please evaluate the interaction by comparing: +1. Before screenshot (initial state) +2. After screenshot (actual result) +3. 
Ground Truth screenshot (expected result)"""
+                },
+                {
+                    "type": "text",
+                    "text": "Before interaction:"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{before_img}"}
+                },
+                {
+                    "type": "text",
+                    "text": "After interaction:"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{after_img}"}
+                },
+                {
+                    "type": "text",
+                    "text": "Ground Truth:"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{ground_truth_img}"}
+                }
+            ]
+        }
+    ]
+
+    # Get GPT-4V evaluation
+    response = openai_client.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=messages,
+        max_tokens=1000
+    )
+
+    evaluation = response.choices[0].message.content
+    # "NOT SUCCESS" also contains the substring "SUCCESS", so rule it out explicitly
+    success = "SUCCESS" in evaluation.upper() and "NOT SUCCESS" not in evaluation.upper()
+
+    return {
+        "task_id": task["id"],
+        "success": success,
+        "evaluation": evaluation,
+        "timestamp": int(time.time())
+    }
+
+def run_evaluation(
+    tasks_file: Path,
+    results_dir: Path,
+    ground_truth_dir: Path,
+    output_file: Path,
+    openai_key: str
+):
+    """Run evaluation on benchmark results"""
+    # Load environment variables
+    load_dotenv()
+
+    # Initialize OpenAI client
+    openai_client = OpenAI(api_key=openai_key)
+
+    # Load tasks and results
+    with open(tasks_file) as f:
+        tasks = [json.loads(line) for line in f]
+
+    with open(results_dir / "results.json") as f:
+        results = json.load(f)
+
+    # Evaluate each task
+    evaluations = []
+    for task in tasks:
+        task_result = next((r for r in results if r["task_id"] == task["id"]), None)
+        if task_result:
+            evaluation = evaluate_task(
+                task,
+                task_result,
+                results_dir,
+                ground_truth_dir,
+                openai_client
+            )
+            evaluations.append(evaluation)
+
+    # Save evaluations
+    output = {
+        "total_tasks": len(tasks),
+        "successful_tasks": sum(1 for e in evaluations if e["success"]),
+        "evaluations": evaluations
+    }
+
+    with open(output_file, "w") as f:
+        json.dump(output, f, indent=2)
+
+def main():
+    parser = argparse.ArgumentParser(description="Evaluate DOM benchmark results")
+    parser.add_argument("--tasks", type=Path, required=True, help="Path to tasks JSONL file")
+    parser.add_argument("--results", type=Path, required=True, help="Path to results directory")
+    parser.add_argument("--ground-truth", type=Path, required=True, help="Path to ground truth directory")
+    parser.add_argument("--output", type=Path, required=True, help="Path to output evaluation file")
+    parser.add_argument("--openai-key", type=str, required=True, help="OpenAI API key")
+
+    args = parser.parse_args()
+    run_evaluation(args.tasks, args.results, args.ground_truth, args.output, args.openai_key)
+
+if __name__ == "__main__":
+    main()
diff --git a/image_match.py b/image_match.py
new file mode 100644
index 0000000..c8eda06
--- /dev/null
+++ b/image_match.py
@@ -0,0 +1,67 @@
+import base64
+import requests
+from openai import OpenAI
+import os
+
+
+from dotenv import load_dotenv
+load_dotenv()
+
+system_prompt = """
+A task required an agent to create an image based on a prompt, and your task is to compare the image it generated with the image it was supposed to generate.
+ +Your output should be in the following format: +Correctness: [True/False] +Reason: [Reason for the correctness/incorrectness of the agent's output] + +""" + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + +def compare_images(prompt, ground_truth_path, agent_image_path, note = None): + # TODO: if the image is not there (the agent image path), then return False, "The agent did not generate an image" + + print (f"[DEBUG] Debugging the image output of this agent execution.") + if not os.path.exists(agent_image_path): + print (f"[DEBUG] The agent did not generate an image or generated the image with the wrong name or the wrong path.") + return False, "The agent did not generate an image or generated the image with the wrong name or the wrong path." + + client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + image1 = encode_image(ground_truth_path) + image2 = encode_image(agent_image_path) + user_prompt = f"The agent was trying to accomplish the following task: {prompt} The first image is the expected image and the second image is the agent's output. Does the image answer the question correctly as the expected image? Don't focus on unnecessary details, like axes titles or colors or image size or labels unless specified in the task." + if note: + user_prompt += f"Here are some notes to help you evaluate the images: {note}" + response = client.chat.completions.create( + model="gpt-4o", + temperature=0, + messages=[ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image1}" + } + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image2}" + } + } + ] + } + ], + max_tokens=300 + ) + print (f"[DEBUG] Response from the image comparison: {response.choices[0].message.content}") + print (f"[DEBUG] Image Correctness: {response.choices[0].message.content.lower().strip() == 'true'}") + return "true" in response.choices[0].message.content.lower().strip(), response.choices[0].message.content + diff --git a/prompts.py b/prompts.py new file mode 100644 index 0000000..1499e23 --- /dev/null +++ b/prompts.py @@ -0,0 +1,50 @@ +from typing import Dict, Any + +SYSTEM_PROMPT = """You are an AI agent designed to interact with web elements. Your task is to execute specific web interactions based on natural language descriptions. + +Focus on the following: +1. Element Identification: Use the provided accessibility tree and visual context to identify the correct element +2. Precise Interaction: Execute the exact interaction required (click, type, hover) +3. Accuracy: Ensure you interact with the correct element, as there may be similar elements on the page + +Guidelines: +- Pay attention to element attributes (role, type, name) in the accessibility tree +- Consider the visual context and location of elements +- Be precise in your interactions - click exactly where specified +- Handle dynamic elements and wait for page loads appropriately + +Example Task: +{ + "web_name": "Amazon", + "task": "Click the search button", + "web": "https://www.amazon.com", + "element_type": "button", + "interaction": "click", + "target_element": { + "type": "id", + "value": "nav-search-submit-button" + } +} + +Remember: Your goal is to execute the interaction accurately and efficiently. 
+""" + +def format_task_prompt(task: Dict[str, Any], accessibility_tree: Dict[str, Any] = None) -> str: + """Format task into prompt for the agent""" + prompt = f"""Website: {task['web_name']} +Task: {task['task']} +URL: {task['web']} +Required Interaction: {task['interaction']} +Target Element Type: {task['element_type']} + +Accessibility Tree Information: +""" + + if accessibility_tree: + prompt += f"```json\n{accessibility_tree}\n```\n" + else: + prompt += "Not available\n" + + prompt += "\nPlease execute the specified interaction accurately." + + return prompt diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d30ead9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "dom-and-domer-2" +version = "0.1.0" +description = "A benchmark for evaluating language models' ability to execute web element interactions" +readme = "README.md" +requires-python = ">=3.8" +license = "MIT" +keywords = ["benchmark", "dom", "web-interaction", "language-models"] +authors = [ + { name = "Dhruv Ahuja" } +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "selenium==4.15.2", + "webdriver-manager==4.0.1", + "Pillow==10.1.0", + "numpy==1.24.3", + "requests==2.31.0", + "beautifulsoup4==4.12.2", + "openai==1.3.7", + "python-dotenv==1.0.0", +] + +[project.urls] +Homepage = "https://github.com/yourusername/DOM-and-DOMer-2" +Repository = "https://github.com/yourusername/DOM-and-DOMer-2.git" + +[tool.black] +line-length = 88 +target-version = ["py38"] +include = '\.pyi?$' + +[tool.isort] +profile = "black" +multi_line_output = 3 +line_length = 88 + +[tool.mypy] +python_version = "3.8" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +check_untyped_defs = true + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = [ + "tests", +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0141fcf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +selenium>=4.15.2 +webdriver-manager>=4.0.1 +Pillow>=10.1.0 # For image processing +numpy==1.24.3 # For image comparison +requests==2.31.0 +beautifulsoup4==4.12.2 +openai==1.3.7 # For GPT-4V evaluation +python-dotenv==1.0.0 # For environment variables diff --git a/run.py b/run.py new file mode 100644 index 0000000..069ac36 --- /dev/null +++ b/run.py @@ -0,0 +1,165 @@ +import argparse +import json +import logging +import time +from pathlib import Path +from typing import Dict, List, Any + +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from webdriver_manager.chrome import ChromeDriverManager + +from utils import WebInteractionUtils, load_tasks, save_results, get_accessibility_tree, compute_image_similarity + +def setup_logger(output_dir: Path) -> None: + """Setup logging configuration""" + log_file = output_dir / "benchmark.log" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler() + ] + ) + +def setup_driver( + headless: bool = True, 
+ download_dir: str = None, + force_device_scale: bool = True +) -> webdriver.Chrome: + """Setup Chrome WebDriver with specified options""" + options = Options() + + if force_device_scale: + options.add_argument("--force-device-scale-factor=1") + if headless: + options.add_argument("--headless") + options.add_argument( + "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" + ) + if download_dir: + options.add_experimental_option( + "prefs", {"download.default_directory": download_dir} + ) + + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + + service = Service(ChromeDriverManager().install()) + return webdriver.Chrome(service=service, options=options) + +def run_benchmark( + tasks_file: Path, + output_dir: Path, + headless: bool = True, + force_device_scale: bool = True, + save_accessibility_tree: bool = True, + image_match_threshold: float = 0.95 +) -> None: + """Run the DOM benchmark""" + + # Setup + output_dir.mkdir(parents=True, exist_ok=True) + setup_logger(output_dir) + + # Load tasks + tasks = load_tasks(tasks_file) + logging.info(f"Loaded {len(tasks)} tasks from {tasks_file}") + + # Setup WebDriver + driver = setup_driver( + headless=headless, + download_dir=str(output_dir / "downloads"), + force_device_scale=force_device_scale + ) + utils = WebInteractionUtils(driver) + + try: + results = [] + for i, task in enumerate(tasks): + task_id = task["id"] + logging.info(f"Running task {i+1}/{len(tasks)}: {task_id}") + + # Load webpage + driver.get(task["web"]) + time.sleep(2) # Wait for page load + + # Get accessibility tree + if save_accessibility_tree: + tree_file = output_dir / f"accessibility_tree_{task_id}.json" + tree = get_accessibility_tree(driver, str(tree_file)) + logging.info(f"Saved accessibility tree to {tree_file}") + + # Take before screenshot + before_screenshot = output_dir / f"before_{task_id}.png" + driver.save_screenshot(str(before_screenshot)) + + # Execute interaction + success = utils.execute_interaction(task) + time.sleep(1) # Wait for interaction effect + + # Take after screenshot + after_screenshot = output_dir / f"after_{task_id}.png" + driver.save_screenshot(str(after_screenshot)) + + # Compare screenshots + image_similarity = compute_image_similarity(str(before_screenshot), str(after_screenshot)) + + # Save result + result = { + "task_id": task_id, + "success": success, + "image_similarity": image_similarity, + "passed_threshold": image_similarity >= image_match_threshold, + "timestamp": time.time(), + "accessibility_tree": str(tree_file) if save_accessibility_tree else None + } + results.append(result) + + logging.info( + f"Task {task_id} completed: success={success}, " + f"image_similarity={image_similarity:.3f}" + ) + + # Save results + results_file = output_dir / "results.json" + save_results(results, str(results_file)) + logging.info(f"Results saved to {results_file}") + + # Print summary + successful = sum(1 for r in results if r["success"]) + passed_threshold = sum(1 for r in results if r["passed_threshold"]) + logging.info( + f"\nBenchmark Summary:\n" + f"Total Tasks: {len(tasks)}\n" + f"Successful Interactions: {successful}\n" + f"Passed Image Threshold: {passed_threshold}\n" + ) + + finally: + driver.quit() + +def main(): + parser = argparse.ArgumentParser(description="Run DOM Benchmark") + parser.add_argument("--tasks", type=Path, required=True, help="Path to tasks JSONL file") + parser.add_argument("--output", type=Path, required=True, 
help="Output directory for results") + parser.add_argument("--headless", action="store_true", help="Run Chrome in headless mode") + parser.add_argument("--force-device-scale", action="store_true", help="Force device scale factor to 1") + parser.add_argument("--save-accessibility-tree", action="store_true", help="Save accessibility tree for each task") + parser.add_argument("--threshold", type=float, default=0.95, help="Image similarity threshold") + + args = parser.parse_args() + + run_benchmark( + tasks_file=args.tasks, + output_dir=args.output, + headless=args.headless, + force_device_scale=args.force_device_scale, + save_accessibility_tree=args.save_accessibility_tree, + image_match_threshold=args.threshold + ) + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..f3d2c7c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,25 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 +exclude = .git,__pycache__,build,dist + +[coverage:run] +source = dom_and_domer_2 +omit = tests/* + +[coverage:report] +exclude_lines = + pragma: no cover + def __repr__ + if self.debug: + raise NotImplementedError + if __name__ == .__main__.: + pass + raise ImportError + +[tool:pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test +python_functions = test_* +addopts = --verbose diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..1b95bf0 --- /dev/null +++ b/utils.py @@ -0,0 +1,123 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +import json +import base64 +import time +import logging +from typing import Dict, List, Any, Optional +from PIL import Image +import numpy as np + +def get_accessibility_tree(driver: webdriver.Chrome, save_file: Optional[str] = None) -> Dict: + """Get accessibility tree of the current page""" + js_script = """ + function getAccessibilityTree(node, tree = {}) { + tree.role = node.role; + tree.name = node.name; + tree.type = node.type; + if (node.value) tree.value = node.value; + + const rect = node.getBoundingClientRect(); + tree.location = { + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height + }; + + tree.children = []; + for (let child of node.children) { + tree.children.push(getAccessibilityTree(child)); + } + return tree; + } + return getAccessibilityTree(document.documentElement); + """ + tree = driver.execute_script(js_script) + + if save_file: + with open(save_file, 'w') as f: + json.dump(tree, f, indent=2) + + return tree + +class WebInteractionUtils: + def __init__(self, driver: webdriver.Chrome): + self.driver = driver + self.wait = WebDriverWait(driver, 10) + + def find_element(self, locator_type: str, locator: str) -> Optional[Any]: + """Find element with wait and retry logic""" + try: + element = self.wait.until( + EC.presence_of_element_located((getattr(By, locator_type.upper()), locator)) + ) + return element + except Exception as e: + logging.error(f"Failed to find element {locator_type}={locator}: {str(e)}") + return None + + def execute_interaction(self, task: Dict[str, Any]) -> bool: + """Execute web interaction based on task definition""" + try: + # Find element + element = self.find_element( + task["target_element"].get("type", "XPATH"), + task["target_element"].get("value") + ) + if not element: + 
return False + + # Execute interaction + interaction = task["interaction"].lower() + if interaction == "click": + element.click() + elif interaction == "type": + element.clear() + element.send_keys(task.get("input_text", "")) + elif interaction == "hover": + ActionChains(self.driver).move_to_element(element).perform() + else: + logging.error(f"Unknown interaction type: {interaction}") + return False + + return True + + except Exception as e: + logging.error(f"Failed to execute interaction: {str(e)}") + return False + +def compute_image_similarity(img1_path: str, img2_path: str) -> float: + """Compute similarity between two images""" + def load_and_process(path): + img = Image.open(path).convert('RGB') + img = img.resize((224, 224)) # Standard size + return np.array(img) + + img1 = load_and_process(img1_path) + img2 = load_and_process(img2_path) + + # Compute MSE + mse = np.mean((img1 - img2) ** 2) + # Convert to similarity score (1 = identical, 0 = completely different) + similarity = 1 / (1 + mse) + + return similarity + +def load_tasks(tasks_file: str) -> List[Dict[str, Any]]: + """Load tasks from JSONL file""" + tasks = [] + with open(tasks_file) as f: + for line in f: + if line.strip(): + tasks.append(json.loads(line)) + return tasks + +def save_results(results: List[Dict[str, Any]], output_file: str) -> None: + """Save benchmark results to JSON file""" + with open(output_file, 'w') as f: + json.dump(results, f, indent=2) diff --git a/utils/screenshot_utils.py b/utils/screenshot_utils.py new file mode 100644 index 0000000..d81eb2a --- /dev/null +++ b/utils/screenshot_utils.py @@ -0,0 +1,113 @@ +import selenium +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +import time +import os +import base64 +from pathlib import Path + +def setup_driver(): + """Initialize Chrome driver with appropriate settings""" + chrome_options = Options() + chrome_options.add_argument("--window-size=1920,1080") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + return webdriver.Chrome(options=chrome_options) + +def take_element_screenshot(driver, element, output_path): + """Take a screenshot of a specific element""" + element_png = element.screenshot_as_png + with open(output_path, "wb") as f: + f.write(element_png) + +def take_full_page_screenshot(driver, url, output_path): + """Take a full page screenshot with consistent rendering""" + try: + driver.get(url) + time.sleep(3) # Wait for page load and any animations + + # Normalize color scheme and rendering + driver.execute_script(""" + document.documentElement.style.colorScheme = 'normal'; + document.documentElement.style.forcedColorAdjust = 'none'; + """) + + # Get page metrics and capture full page + metrics = driver.execute_cdp_cmd('Page.getLayoutMetrics', {}) + screenshot_config = { + 'captureBeyondViewport': True, + 'fromSurface': True, + 'clip': { + 'x': 0, + 'y': 0, + 'width': metrics['cssContentSize']['width'], + 'height': metrics['cssContentSize']['height'], + 'scale': 1 + } + } + screenshot_data = driver.execute_cdp_cmd('Page.captureScreenshot', screenshot_config) + + # Save screenshot + with open(output_path, 'wb') as f: + f.write(base64.b64decode(screenshot_data['data'])) + print(f"Screenshot saved 
successfully as {output_path}") + + except Exception as e: + print(f"Screenshot failed: {str(e)}") + raise + +def capture_task_screenshots(task_data, ground_truth_dir): + """Capture before and after screenshots for a task""" + driver = setup_driver() + try: + # Create screenshot paths + task_id = task_data["id"] + before_path = Path(ground_truth_dir) / f"{task_id}_before.png" + after_path = Path(ground_truth_dir) / f"{task_id}_gt.png" + + # Take before screenshot + take_full_page_screenshot(driver, task_data["web"], str(before_path)) + + # Perform task action + if task_data["element_type"] == "input": + element = WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.ID, task_data["target_element"]["value"])) + ) + element.click() + if "input_text" in task_data: + element.send_keys(task_data["input_text"]) + + elif task_data["element_type"] == "button": + element = WebDriverWait(driver, 10).until( + EC.element_to_be_clickable((By.CLASS_NAME, task_data["target_element"]["value"])) + ) + element.click() + + # Wait for any transitions/loading + time.sleep(2) + + # Take after screenshot + take_full_page_screenshot(driver, driver.current_url, str(after_path)) + + finally: + driver.quit() + +if __name__ == "__main__": + # Example usage + from pathlib import Path + import json + + # Load tasks + tasks_file = Path("data/dom_tasks.jsonl") + ground_truth_dir = Path("data/ground_truth") + ground_truth_dir.mkdir(exist_ok=True) + + with open(tasks_file) as f: + for line in f: + task = json.loads(line) + capture_task_screenshots(task, ground_truth_dir)