
Commit

Works
dhruvahuja19 committed Dec 16, 2024
1 parent 23d4aa5 commit c02b676
Showing 6 changed files with 382 additions and 129 deletions.
1 change: 0 additions & 1 deletion data/dom_tasks.jsonl
@@ -1,2 +1 @@
{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "accessibility_changes": ["Search box aria-value updates to 'hello'", "Search suggestions list may become visible"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "accessibility_changes": ["Search results region becomes visible", "Page title updates to include 'hello'", "Search results are announced to screen readers"], "success_criteria": ["Search button responds to click", "Results page loads with 'hello' definition", "No error messages are displayed"]}}
168 changes: 63 additions & 105 deletions evaluation/auto_eval.py
@@ -9,7 +9,7 @@
from openai import OpenAI
from dotenv import load_dotenv

SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on:
SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on visual comparison:
1. Task Description: A specific web interaction task (e.g., "Click the search button", "Type text in input field")
@@ -18,28 +18,18 @@
- After: Actual result after interaction
- Ground Truth: Expected result for successful interaction
- Expected Visual Changes: List of specific visual changes to verify
3. Accessibility Validation:
- Accessibility Tree: JSON representation of webpage's accessibility state
- Expected Accessibility Changes: List of specific accessibility changes to verify
4. Success Criteria:
- Specific conditions that must be met for success
- Visual state matches ground truth
- Accessibility state reflects expected changes
Your evaluation should:
1. Compare before/after/ground-truth screenshots
1. Compare the after screenshot with the ground truth screenshot
2. Verify all listed visual changes occurred
3. Validate accessibility tree changes
4. Check all success criteria are met
3. Pay special attention to the relevant regions where changes should occur
Provide your evaluation as:
1. 'SUCCESS' or 'NOT SUCCESS'
2. Detailed explanation of:
1. A score from 0-100 based on visual similarity and completion of expected changes
2. 'SUCCESS' if score ≥ 90, otherwise 'NOT SUCCESS'
3. Brief explanation of:
- Visual changes observed/missing
- Accessibility changes verified/missing
- Success criteria met/failed"""
- Why the interaction succeeded or failed"""

def encode_image(image_path: str) -> str:
"""Encode image as base64 string"""
@@ -49,101 +39,70 @@ def encode_image(image_path: str) -> str:
def evaluate_task(
task: Dict[str, Any],
result: Dict[str, Any],
output_dir: Path,
ground_truth_dir: Path,
ground_truth: Dict[str, Any],
openai_client: OpenAI
) -> Dict[str, Any]:
"""Evaluate a single task using GPT-4V"""

# Get screenshots
before_img = encode_image(str(output_dir / f"before_{task['id']}.png"))
after_img = encode_image(str(output_dir / f"after_{task['id']}.png"))
ground_truth_img = encode_image(str(ground_truth_dir / task['ground_truth']['screenshot']))

# Get accessibility tree if available
tree_path = output_dir / f"accessibility_tree_{task['id']}.json"
accessibility_tree = None
if tree_path.exists():
with open(tree_path) as f:
accessibility_tree = json.load(f)
"""Evaluate a single task using GPT-4V based on visual comparison"""

# Format prompt with enhanced ground truth information
messages = [
{
"role": "system",
"content": SYSTEM_PROMPT
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"""Task: {task['task']}
Website: {task['web_name']}
Interaction: {task['interaction']}
Element Type: {task['element_type']}
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"""
Task: {task['task']}
Ground Truth Information:
1. Description: {task['ground_truth']['description']}
2. Expected Visual Changes:
{chr(10).join(f' - {change}' for change in task['ground_truth'].get('visual_changes', []))}
3. Expected Accessibility Changes:
{chr(10).join(f' - {change}' for change in task['ground_truth'].get('accessibility_changes', []))}
4. Success Criteria:
{chr(10).join(f' - {criterion}' for criterion in task['ground_truth'].get('success_criteria', []))}
Accessibility Tree:
{json.dumps(accessibility_tree, indent=2) if accessibility_tree else 'Not available'}
Please evaluate the interaction by comparing:
Please compare:
1. Before screenshot (initial state)
2. After screenshot (actual result)
3. Ground Truth screenshot (expected result)"""
},
{
"type": "text",
"text": "Before interaction:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{before_img}"}
},
{
"type": "text",
"text": "After interaction:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{after_img}"}
},
{
"type": "text",
"text": "Ground Truth:"
},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{ground_truth_img}"}
}
]
}
3. Ground truth screenshot (expected result)
Expected visual changes:
{json.dumps(ground_truth['visual_changes'], indent=2)}
Provide:
1. Similarity score (0-100)
2. Success status
3. Brief explanation"""},
{"role": "assistant", "content": "I'll examine the screenshots and evaluate based on visual similarity and expected changes."},
{"role": "user", "content": [
{"type": "text", "text": "Before interaction:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}},
{"type": "text", "text": "After interaction:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}},
{"type": "text", "text": "Ground Truth:"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}},
]}
]

# Get GPT-4V evaluation
response = openai_client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
max_tokens=1000
)

evaluation = response.choices[0].message.content
success = "SUCCESS" in evaluation.upper()

return {
"task_id": task["id"],
"success": success,
"evaluation": evaluation,
"timestamp": int(time.time())
}

try:
response = openai_client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
max_tokens=1000,
temperature=0
)

evaluation = response.choices[0].message.content

# Extract score and success status
import re
score_match = re.search(r'(\d+)(?=/100|%)', evaluation)
score = int(score_match.group(1)) if score_match else 0

return {
"task_id": task["id"],
"score": score,
"success": score >= 90,
"evaluation": evaluation,
"timestamp": int(time.time())
}

except Exception as e:
return {
"task_id": task["id"],
"score": 0,
"success": False,
"evaluation": f"Evaluation failed: {str(e)}",
"timestamp": int(time.time())
}
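
Note on the score parsing above: the regex only matches a number immediately followed by "/100" or "%", so a response phrased as "Score: 85" falls back to 0. A quick check (a sketch; the sample response text is invented):

import re

sample_evaluation = "Similarity score: 85/100. NOT SUCCESS - the search box lost focus."
score_match = re.search(r'(\d+)(?=/100|%)', sample_evaluation)
score = int(score_match.group(1)) if score_match else 0  # 85
success = score >= 90                                     # False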

def run_evaluation(
tasks_file: Path,
@@ -174,7 +133,6 @@ def run_evaluation(
evaluation = evaluate_task(
task,
task_result,
results_dir,
ground_truth_dir,
openai_client
)
16 changes: 8 additions & 8 deletions requirements.txt
@@ -1,8 +1,8 @@
selenium>=4.15.2
webdriver-manager>=4.0.1
Pillow>=10.1.0 # For image processing
numpy==1.24.3 # For image comparison
requests==2.31.0
beautifulsoup4==4.12.2
openai==1.3.7 # For GPT-4V evaluation
python-dotenv==1.0.0 # For environment variables
selenium
webdriver-manager
Pillow
numpy
requests
beautifulsoup4
openai
python-dotenv
46 changes: 31 additions & 15 deletions utils.py
@@ -16,10 +16,11 @@ def get_accessibility_tree(driver: webdriver.Chrome, save_file: Optional[str] =
"""Get accessibility tree of the current page"""
js_script = """
function getAccessibilityTree(node, tree = {}) {
tree.role = node.role;
tree.name = node.name;
tree.type = node.type;
if (node.value) tree.value = node.value;
tree.role = node.role || '';
tree.name = node.tagName || '';
tree.type = node.type || '';
tree.value = node.value || '';
tree.textContent = node.textContent ? node.textContent.trim() : '';
const rect = node.getBoundingClientRect();
tree.location = {
@@ -30,8 +31,9 @@
};
tree.children = [];
for (let child of node.children) {
tree.children.push(getAccessibilityTree(child));
const children = node.children;
for (let i = 0; i < children.length; i++) {
tree.children.push(getAccessibilityTree(children[i]));
}
return tree;
}
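
Note: the call that actually executes this script sits in the collapsed portion of get_accessibility_tree. The usual Selenium pattern (a sketch; the real invocation in utils.py may differ) is:

import json
from selenium import webdriver

def run_tree_script(driver: webdriver.Chrome, js_script: str, save_file: str = None) -> dict:
    # execute_script returns the value of a top-level `return`, so append one
    tree = driver.execute_script(js_script + "\nreturn getAccessibilityTree(document.body);")
    if save_file:
        with open(save_file, "w") as f:
            json.dump(tree, f, indent=2)
    return tree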
@@ -93,20 +95,21 @@ def execute_interaction(self, task: Dict[str, Any]) -> bool:

def compute_image_similarity(img1_path: str, img2_path: str) -> float:
"""Compute similarity between two images"""
def load_and_process(path):
img = Image.open(path).convert('RGB')
img = img.resize((224, 224)) # Standard size
return np.array(img)
img1 = np.array(Image.open(img1_path))
img2 = np.array(Image.open(img2_path))

img1 = load_and_process(img1_path)
img2 = load_and_process(img2_path)
# Ensure same size
if img1.shape != img2.shape:
img2 = np.array(Image.open(img2_path).resize((img1.shape[1], img1.shape[0])))

# Compute MSE
mse = np.mean((img1 - img2) ** 2)
# Convert to similarity score (1 = identical, 0 = completely different)

# Convert to similarity score (0 to 1)
similarity = 1 / (1 + mse)

return similarity
# Convert numpy float to Python float
return float(similarity)
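
Note: with similarity = 1 / (1 + MSE), anything short of a near-pixel-identical pair scores close to 0, so thresholds on this value need to account for that. Hypothetical usage (the paths are placeholders, not files in this repo):

sim = compute_image_similarity("results/after_cambridge_lookup_1.png",
                               "ground_truth/cambridge_lookup_1_gt.png")
print(f"visual similarity: {sim:.4f}")  # 1.0 only when the screenshots match pixel-for-pixel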

def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:
"""Load tasks from JSONL file"""
@@ -119,5 +122,18 @@ def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:

def save_results(results: List[Dict[str, Any]], output_file: str) -> None:
"""Save benchmark results to JSON file"""
# Convert any numpy types to Python types
serializable_results = []
for result in results:
serializable_result = {}
for key, value in result.items():
if isinstance(value, np.floating):
serializable_result[key] = float(value)
elif isinstance(value, np.integer):
serializable_result[key] = int(value)
else:
serializable_result[key] = value
serializable_results.append(serializable_result)

with open(output_file, 'w') as f:
json.dump(results, f, indent=2)
json.dump(serializable_results, f, indent=2)
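
Note on the serialization change: numpy scalars such as np.int64 are not JSON-serializable, which is what the conversion loop above guards against. A minimal reproduction (a sketch, not code from this repo):

import json
import numpy as np

result = {"task_id": "cambridge_lookup_1", "score": np.int64(93)}
try:
    json.dumps(result)                     # TypeError: Object of type int64 is not JSON serializable
except TypeError:
    result["score"] = int(result["score"])
print(json.dumps(result))                  # {"task_id": "cambridge_lookup_1", "score": 93}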
