
Commit

Works
dhruvahuja19 committed Dec 16, 2024
1 parent 23d4aa5 commit c02b676
Showing 6 changed files with 382 additions and 129 deletions.
1 change: 0 additions & 1 deletion data/dom_tasks.jsonl
@@ -1,2 +1 @@
{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "accessibility_changes": ["Search box aria-value updates to 'hello'", "Search suggestions list may become visible"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "accessibility_changes": ["Search results region becomes visible", "Page title updates to include 'hello'", "Search results are announced to screen readers"], "success_criteria": ["Search button responds to click", "Results page loads with 'hello' definition", "No error messages are displayed"]}}
168 changes: 63 additions & 105 deletions evaluation/auto_eval.py
@@ -9,7 +9,7 @@
from openai import OpenAI
from dotenv import load_dotenv

SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on:
SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on visual comparison:
1. Task Description: A specific web interaction task (e.g., "Click the search button", "Type text in input field")
@@ -18,28 +18,18 @@
- After: Actual result after interaction
- Ground Truth: Expected result for successful interaction
- Expected Visual Changes: List of specific visual changes to verify
3. Accessibility Validation:
- Accessibility Tree: JSON representation of webpage's accessibility state
- Expected Accessibility Changes: List of specific accessibility changes to verify
4. Success Criteria:
- Specific conditions that must be met for success
- Visual state matches ground truth
- Accessibility state reflects expected changes
Your evaluation should:
1. Compare before/after/ground-truth screenshots
1. Compare the after screenshot with the ground truth screenshot
2. Verify all listed visual changes occurred
3. Validate accessibility tree changes
4. Check all success criteria are met
3. Pay special attention to the relevant regions where changes should occur
Provide your evaluation as:
1. 'SUCCESS' or 'NOT SUCCESS'
2. Detailed explanation of:
1. A score from 0-100 based on visual similarity and completion of expected changes
2. 'SUCCESS' if score ≥ 90, otherwise 'NOT SUCCESS'
3. Brief explanation of:
- Visual changes observed/missing
- Accessibility changes verified/missing
- Success criteria met/failed"""
- Why the interaction succeeded or failed"""

def encode_image(image_path: str) -> str:
"""Encode image as base64 string"""
@@ -49,101 +39,70 @@ def encode_image(image_path: str) -> str:
def evaluate_task(
task: Dict[str, Any],
result: Dict[str, Any],
output_dir: Path,
ground_truth_dir: Path,
ground_truth: Dict[str, Any],
openai_client: OpenAI
) -> Dict[str, Any]:
"""Evaluate a single task using GPT-4V"""

# Get screenshots
before_img = encode_image(str(output_dir / f"before_{task['id']}.png"))
after_img = encode_image(str(output_dir / f"after_{task['id']}.png"))
ground_truth_img = encode_image(str(ground_truth_dir / task['ground_truth']['screenshot']))

# Get accessibility tree if available
tree_path = output_dir / f"accessibility_tree_{task['id']}.json"
accessibility_tree = None
if tree_path.exists():
with open(tree_path) as f:
accessibility_tree = json.load(f)
"""Evaluate a single task using GPT-4V based on visual comparison"""

# Format prompt with enhanced ground truth information
messages = [
{
"role": "system",
"content": SYSTEM_PROMPT
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"""Task: {task['task']}
Website: {task['web_name']}
Interaction: {task['interaction']}
Element Type: {task['element_type']}
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"""
Task: {task['task']}
Ground Truth Information:
1. Description: {task['ground_truth']['description']}
2. Expected Visual Changes:
{chr(10).join(f' - {change}' for change in task['ground_truth'].get('visual_changes', []))}
3. Expected Accessibility Changes:
{chr(10).join(f' - {change}' for change in task['ground_truth'].get('accessibility_changes', []))}
4. Success Criteria:
{chr(10).join(f' - {criterion}' for criterion in task['ground_truth'].get('success_criteria', []))}
Accessibility Tree:
{json.dumps(accessibility_tree, indent=2) if accessibility_tree else 'Not available'}
Please evaluate the interaction by comparing:
Please compare:
1. Before screenshot (initial state)
2. After screenshot (actual result)
3. Ground Truth screenshot (expected result)"""
},
{
"type": "text",
"text": "Before interaction:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{before_img}"}
},
{
"type": "text",
"text": "After interaction:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{after_img}"}
},
{
"type": "text",
"text": "Ground Truth:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{ground_truth_img}"}
}
]
}
3. Ground truth screenshot (expected result)
Expected visual changes:
{json.dumps(ground_truth['visual_changes'], indent=2)}
Provide:
1. Similarity score (0-100)
2. Success status
3. Brief explanation"""},
{"role": "assistant", "content": "I'll examine the screenshots and evaluate based on visual similarity and expected changes."},
{"role": "user", "content": [
{"type": "text", "text": "Before interaction:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}},
{"type": "text", "text": "After interaction:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}},
{"type": "text", "text": "Ground Truth:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}},
]}
]

# Get GPT-4V evaluation
response = openai_client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
max_tokens=1000
)

evaluation = response.choices[0].message.content
success = "SUCCESS" in evaluation.upper()

return {
"task_id": task["id"],
"success": success,
"evaluation": evaluation,
"timestamp": int(time.time())
}

try:
response = openai_client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
max_tokens=1000,
temperature=0
)

evaluation = response.choices[0].message.content

# Extract score and success status
import re
score_match = re.search(r'(\d+)(?=/100|%)', evaluation)
score = int(score_match.group(1)) if score_match else 0

return {
"task_id": task["id"],
"score": score,
"success": score >= 90,
"evaluation": evaluation,
"timestamp": int(time.time())
}

except Exception as e:
return {
"task_id": task["id"],
"score": 0,
"success": False,
"evaluation": f"Evaluation failed: {str(e)}",
"timestamp": int(time.time())
}
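
Note on the score parsing above: the regex only matches a number immediately followed by "/100" or "%", so a response phrased as "Score: 85" falls back to 0. A quick check (a sketch; the sample response text is invented):

import re

sample_evaluation = "Similarity score: 85/100. NOT SUCCESS - the search box lost focus."
score_match = re.search(r'(\d+)(?=/100|%)', sample_evaluation)
score = int(score_match.group(1)) if score_match else 0  # 85
success = score >= 90                                     # False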

def run_evaluation(
tasks_file: Path,
@@ -174,7 +133,6 @@ def run_evaluation(
evaluation = evaluate_task(
task,
task_result,
results_dir,
ground_truth_dir,
openai_client
)
16 changes: 8 additions & 8 deletions requirements.txt
@@ -1,8 +1,8 @@
selenium>=4.15.2
webdriver-manager>=4.0.1
Pillow>=10.1.0 # For image processing
numpy==1.24.3 # For image comparison
requests==2.31.0
beautifulsoup4==4.12.2
openai==1.3.7 # For GPT-4V evaluation
python-dotenv==1.0.0 # For environment variables
selenium
webdriver-manager
Pillow
numpy
requests
beautifulsoup4
openai
python-dotenv
46 changes: 31 additions & 15 deletions utils.py
@@ -16,10 +16,11 @@ def get_accessibility_tree(driver: webdriver.Chrome, save_file: Optional[str] =
"""Get accessibility tree of the current page"""
js_script = """
function getAccessibilityTree(node, tree = {}) {
tree.role = node.role;
tree.name = node.name;
tree.type = node.type;
if (node.value) tree.value = node.value;
tree.role = node.role || '';
tree.name = node.tagName || '';
tree.type = node.type || '';
tree.value = node.value || '';
tree.textContent = node.textContent ? node.textContent.trim() : '';
const rect = node.getBoundingClientRect();
tree.location = {
@@ -30,8 +31,9 @@
};
tree.children = [];
for (let child of node.children) {
tree.children.push(getAccessibilityTree(child));
const children = node.children;
for (let i = 0; i < children.length; i++) {
tree.children.push(getAccessibilityTree(children[i]));
}
return tree;
}
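
Note: the call that actually executes this script sits in the collapsed portion of get_accessibility_tree. The usual Selenium pattern (a sketch; the real invocation in utils.py may differ) is:

import json
from selenium import webdriver

def run_tree_script(driver: webdriver.Chrome, js_script: str, save_file: str = None) -> dict:
    # execute_script returns the value of a top-level `return`, so append one
    tree = driver.execute_script(js_script + "\nreturn getAccessibilityTree(document.body);")
    if save_file:
        with open(save_file, "w") as f:
            json.dump(tree, f, indent=2)
    return tree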
@@ -93,20 +95,21 @@ def execute_interaction(self, task: Dict[str, Any]) -> bool:

def compute_image_similarity(img1_path: str, img2_path: str) -> float:
"""Compute similarity between two images"""
def load_and_process(path):
img = Image.open(path).convert('RGB')
img = img.resize((224, 224)) # Standard size
return np.array(img)
img1 = np.array(Image.open(img1_path))
img2 = np.array(Image.open(img2_path))

img1 = load_and_process(img1_path)
img2 = load_and_process(img2_path)
# Ensure same size
if img1.shape != img2.shape:
img2 = np.array(Image.open(img2_path).resize((img1.shape[1], img1.shape[0])))

# Compute MSE
mse = np.mean((img1 - img2) ** 2)
# Convert to similarity score (1 = identical, 0 = completely different)

# Convert to similarity score (0 to 1)
similarity = 1 / (1 + mse)

return similarity
# Convert numpy float to Python float
return float(similarity)
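
Note: with similarity = 1 / (1 + MSE), anything short of a near-pixel-identical pair scores close to 0, so thresholds on this value need to account for that. Hypothetical usage (the paths are placeholders, not files in this repo):

sim = compute_image_similarity("results/after_cambridge_lookup_1.png",
                               "ground_truth/cambridge_lookup_1_gt.png")
print(f"visual similarity: {sim:.4f}")  # 1.0 only when the screenshots match pixel-for-pixel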

def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:
"""Load tasks from JSONL file"""
@@ -119,5 +122,18 @@ def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:

def save_results(results: List[Dict[str, Any]], output_file: str) -> None:
"""Save benchmark results to JSON file"""
# Convert any numpy types to Python types
serializable_results = []
for result in results:
serializable_result = {}
for key, value in result.items():
if isinstance(value, np.floating):
serializable_result[key] = float(value)
elif isinstance(value, np.integer):
serializable_result[key] = int(value)
else:
serializable_result[key] = value
serializable_results.append(serializable_result)

with open(output_file, 'w') as f:
json.dump(results, f, indent=2)
json.dump(serializable_results, f, indent=2)
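
Note on the serialization change: numpy scalars such as np.int64 are not JSON-serializable, which is what the conversion loop above guards against. A minimal reproduction (a sketch, not code from this repo):

import json
import numpy as np

result = {"task_id": "cambridge_lookup_1", "score": np.int64(93)}
try:
    json.dumps(result)                     # TypeError: Object of type int64 is not JSON serializable
except TypeError:
    result["score"] = int(result["score"])
print(json.dumps(result))                  # {"task_id": "cambridge_lookup_1", "score": 93}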
