diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..4cbbd87
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,83 @@
+# DOM Task Format
+
+This document describes the format for DOM interaction tasks in our benchmark.
+
+## Schema
+
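+Tasks are stored in JSONL format: each line of the data file is one complete, valid JSON object that follows the schema in `task_schema.json`. The example below is pretty-printed across several lines for readability only.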
+
+## Example Task
+
+```json
+{
+ "web_name": "Cambridge Dictionary",
+ "id": "cambridge_lookup_1",
+ "task": "Click the search box and type 'hello'",
+ "web": "https://dictionary.cambridge.org/",
+ "element_type": "input",
+ "interaction": "type",
+ "target_element": {
+ "type": "id",
+ "value": "searchword"
+ },
+ "input_text": "hello",
+ "target_html": "",
+ "ground_truth": {
+ "screenshot": "cambridge_lookup_1_gt.png",
+ "description": "The word 'hello' has been entered in the search box",
+ "visual_changes": [
+ "Text 'hello' appears in search box",
+ "Text cursor visible at end of input",
+ "Search suggestions may appear"
+ ],
+ "success_criteria": [
+ "Input text matches 'hello' exactly",
+ "Text is visible in search box",
+ "Search box maintains focus"
+ ]
+ }
+}
+```
+
+## Field Descriptions
+
+### Basic Information
+- `web_name`: Name of the website
+- `id`: Unique identifier for the task
+- `task`: Human-readable task description
+- `web`: Website URL
+
+### Element and Interaction
+- `element_type`: Type of HTML element (input, button, link, etc.)
+- `interaction`: Type of interaction (click, type, hover)
+- `target_element`: How to locate the element
+  - `type`: Selector type (id, class, text)
+  - `value`: Selector value
+- `input_text`: Text to type (only for tasks whose `interaction` is `type`)
+
+### Validation
+- `target_html`: The actual HTML of the target element, used for structural validation
+- `ground_truth`: Validation data
+  - `screenshot`: Reference screenshot filename
+  - `description`: Description of the expected outcome
+  - `visual_changes`: List of expected visual changes
+  - `success_criteria`: Specific conditions for success
+
+## Validation Process
+
+Tasks are validated using two methods:
+1. **Visual Validation** (60% of score)
+   - Compares screenshots taken before and after the interaction
+   - Verifies that the visual changes match the ground truth
+
+2. **HTML Validation** (40% of score)
+   - Compares the HTML element the model interacted with against the task's `target_html`
+   - Checks structure, attributes, and content
+
+## Adding New Tasks
+
+1. Follow the schema in `task_schema.json`
+2. Ensure unique task IDs
+3. Provide clear success criteria
+4. Include reference screenshots
+5. Fill in the `target_html` field with the actual HTML element
diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl
index 8dba4f3..a5250c5 100644
--- a/data/dom_tasks.jsonl
+++ b/data/dom_tasks.jsonl
@@ -1,80 +1,80 @@
-{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "success_criteria": ["Search button responds to click", "Results page loads", "No error messages displayed"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_search_1", "task": "Click the search box and type 'vegetarian lasagna'", "web": "https://www.allrecipes.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-box"}, "input_text": "vegetarian lasagna", "ground_truth": {"screenshot": "allrecipes_search_1_gt.png", "description": "Search term entered in search box", "visual_changes": ["Text appears in search box", "Search suggestions may appear"], "success_criteria": ["Text matches exactly", "Search box contains text"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_filter_1", "task": "Click the 'Ratings' filter button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ratings"}, "ground_truth": {"screenshot": "allrecipes_filter_1_gt.png", "description": "Ratings filter dropdown opens", "visual_changes": ["Dropdown menu appears", "Filter options visible"], "success_criteria": ["Dropdown menu is visible", "Filter options are clickable"]}}
-{"web_name": "Amazon", "id": "amazon_search_1", "task": "Click the search box and type 'laptop'", "web": "https://www.amazon.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "twotabsearchtextbox"}, "input_text": "laptop", "ground_truth": {"screenshot": "amazon_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Search suggestions appear"], "success_criteria": ["Text matches exactly", "Search suggestions visible"]}}
-{"web_name": "Amazon", "id": "amazon_menu_1", "task": "Click the hamburger menu button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "nav-hamburger-menu"}, "ground_truth": {"screenshot": "amazon_menu_1_gt.png", "description": "Side menu opens", "visual_changes": ["Menu slides in from left", "Menu options visible"], "success_criteria": ["Menu is visible", "Menu items are clickable"]}}
-{"web_name": "Apple", "id": "apple_menu_1", "task": "Click the 'Mac' menu item", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Mac"}, "ground_truth": {"screenshot": "apple_menu_1_gt.png", "description": "Mac page loads", "visual_changes": ["Page transitions to Mac section", "Mac products visible"], "success_criteria": ["Page URL changes", "Mac content visible"]}}
-{"web_name": "Apple", "id": "apple_search_1", "task": "Click the search icon", "web": "https://www.apple.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-search-trigger"}, "ground_truth": {"screenshot": "apple_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search box is active"]}}
-{"web_name": "ArXiv", "id": "arxiv_search_1", "task": "Click the search box and type 'quantum computing'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "quantum computing", "ground_truth": {"screenshot": "arxiv_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box"], "success_criteria": ["Text matches exactly", "Search box contains text"]}}
-{"web_name": "ArXiv", "id": "arxiv_filter_1", "task": "Click the 'Advanced Search' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Advanced Search"}, "ground_truth": {"screenshot": "arxiv_filter_1_gt.png", "description": "Advanced search page loads", "visual_changes": ["Page transitions to advanced search", "Search filters visible"], "success_criteria": ["URL changes to advanced search", "Advanced search form visible"]}}
-{"web_name": "BBC News", "id": "bbc_menu_1", "task": "Click the menu button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "orbit-more-menu"}, "ground_truth": {"screenshot": "bbc_menu_1_gt.png", "description": "Menu overlay opens", "visual_changes": ["Menu overlay appears", "Navigation options visible"], "success_criteria": ["Menu overlay is visible", "Menu items are clickable"]}}
-{"web_name": "BBC News", "id": "bbc_search_1", "task": "Click the search icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "orbit-search__button"}, "ground_truth": {"screenshot": "bbc_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search input is active"]}}
-{"web_name": "Booking", "id": "booking_destination_1", "task": "Click the destination input and type 'Paris'", "web": "https://www.booking.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "ss"}, "input_text": "Paris", "ground_truth": {"screenshot": "booking_destination_1_gt.png", "description": "Destination entered", "visual_changes": ["Text appears in input", "Location suggestions appear"], "success_criteria": ["Text matches exactly", "Suggestions are visible"]}}
-{"web_name": "Booking", "id": "booking_dates_1", "task": "Click the check-in date field", "web": "https://www.booking.com/", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "sb-date-field__field"}, "ground_truth": {"screenshot": "booking_dates_1_gt.png", "description": "Calendar overlay opens", "visual_changes": ["Calendar overlay appears", "Available dates highlighted"], "success_criteria": ["Calendar is visible", "Dates are selectable"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_menu_1", "task": "Click the 'More' menu button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "More"}, "ground_truth": {"screenshot": "cambridge_menu_1_gt.png", "description": "More menu opens", "visual_changes": ["Dropdown menu appears", "Menu options visible"], "success_criteria": ["Dropdown is visible", "Menu items are clickable"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_menu_1", "task": "Click the 'Ingredients' button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ingredients"}, "ground_truth": {"screenshot": "allrecipes_menu_1_gt.png", "description": "Ingredients section expands", "visual_changes": ["Section expands", "Ingredient list visible"], "success_criteria": ["Section is expanded", "Ingredients are visible"]}}
-{"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the cart icon", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "ground_truth": {"screenshot": "amazon_cart_1_gt.png", "description": "Cart page loads", "visual_changes": ["Page transitions to cart", "Cart contents visible"], "success_criteria": ["Cart page loads", "Cart status visible"]}}
-{"web_name": "Apple", "id": "apple_store_1", "task": "Click the 'Store' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Store"}, "ground_truth": {"screenshot": "apple_store_1_gt.png", "description": "Store page loads", "visual_changes": ["Page transitions to store", "Store products visible"], "success_criteria": ["Store page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_recent_1", "task": "Click the 'Recent' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "recent"}, "ground_truth": {"screenshot": "arxiv_recent_1_gt.png", "description": "Recent submissions page loads", "visual_changes": ["Page transitions to recent submissions", "Recent papers visible"], "success_criteria": ["Recent page loads", "Papers are visible"]}}
-{"web_name": "BBC News", "id": "bbc_sport_1", "task": "Click the 'Sport' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Sport"}, "ground_truth": {"screenshot": "bbc_sport_1_gt.png", "description": "Sports section loads", "visual_changes": ["Page transitions to sports", "Sports news visible"], "success_criteria": ["Sports page loads", "Sports content visible"]}}
-{"web_name": "Booking", "id": "booking_guests_1", "task": "Click the guests selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "xp__guests__count"}, "ground_truth": {"screenshot": "booking_guests_1_gt.png", "description": "Guests selector opens", "visual_changes": ["Guests overlay appears", "Guest options visible"], "success_criteria": ["Overlay is visible", "Guest controls are active"]}}
-{"web_name": "Booking", "id": "booking_search_1", "task": "Click the search button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "sb-searchbox__button"}, "ground_truth": {"screenshot": "booking_search_1_gt.png", "description": "Search results load", "visual_changes": ["Page transitions to results", "Available properties shown"], "success_criteria": ["Results page loads", "Properties are visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_diet_1", "task": "Click the 'Dietary Restrictions' filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Dietary Restrictions"}, "ground_truth": {"screenshot": "allrecipes_diet_1_gt.png", "description": "Diet filter opens", "visual_changes": ["Filter dropdown appears", "Diet options visible"], "success_criteria": ["Dropdown is visible", "Options are clickable"]}}
-{"web_name": "Amazon", "id": "amazon_department_1", "task": "Click the department selector", "web": "https://www.amazon.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "searchDropdownBox"}, "ground_truth": {"screenshot": "amazon_department_1_gt.png", "description": "Department dropdown opens", "visual_changes": ["Dropdown menu appears", "Department list visible"], "success_criteria": ["Dropdown is visible", "Departments are selectable"]}}
-{"web_name": "Apple", "id": "apple_bag_1", "task": "Click the shopping bag icon", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-bag-item"}, "ground_truth": {"screenshot": "apple_bag_1_gt.png", "description": "Shopping bag overlay opens", "visual_changes": ["Bag overlay appears", "Cart contents visible"], "success_criteria": ["Overlay is visible", "Cart status shown"]}}
-{"web_name": "ArXiv", "id": "arxiv_subject_1", "task": "Click the subject area dropdown", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "subject"}, "ground_truth": {"screenshot": "arxiv_subject_1_gt.png", "description": "Subject dropdown opens", "visual_changes": ["Dropdown menu appears", "Subject areas visible"], "success_criteria": ["Dropdown is visible", "Subjects are selectable"]}}
-{"web_name": "BBC News", "id": "bbc_region_1", "task": "Click the region selector", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Choose your region"}, "ground_truth": {"screenshot": "bbc_region_1_gt.png", "description": "Region selector opens", "visual_changes": ["Region overlay appears", "Region options visible"], "success_criteria": ["Overlay is visible", "Regions are selectable"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_1", "task": "Click the translation language selector", "web": "https://dictionary.cambridge.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "translation-language"}, "ground_truth": {"screenshot": "cambridge_translate_1_gt.png", "description": "Language dropdown opens", "visual_changes": ["Dropdown menu appears", "Language options visible"], "success_criteria": ["Dropdown is visible", "Languages are selectable"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_sort_1", "task": "Click the sort dropdown", "web": "https://www.allrecipes.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "sort-dropdown"}, "ground_truth": {"screenshot": "allrecipes_sort_1_gt.png", "description": "Sort options appear", "visual_changes": ["Dropdown menu appears", "Sort options visible"], "success_criteria": ["Dropdown is visible", "Options are selectable"]}}
-{"web_name": "Amazon", "id": "amazon_language_1", "task": "Click the language selector", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "icp-nav-flyout"}, "ground_truth": {"screenshot": "amazon_language_1_gt.png", "description": "Language overlay opens", "visual_changes": ["Language overlay appears", "Language options visible"], "success_criteria": ["Overlay is visible", "Languages are selectable"]}}
-{"web_name": "BBC News", "id": "bbc_weather_1", "task": "Click the weather widget", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "weather-widget"}, "ground_truth": {"screenshot": "bbc_weather_1_gt.png", "description": "Weather details expand", "visual_changes": ["Weather details appear", "Forecast visible"], "success_criteria": ["Weather details visible", "Forecast information shown"]}}
-{"web_name": "Booking", "id": "booking_currency_1", "task": "Click the currency selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "bui-button__text"}, "ground_truth": {"screenshot": "booking_currency_1_gt.png", "description": "Currency selector opens", "visual_changes": ["Currency overlay appears", "Currency options visible"], "success_criteria": ["Overlay is visible", "Currencies are selectable"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_share_1", "task": "Click the share button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Share"}, "ground_truth": {"screenshot": "allrecipes_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Share options are clickable"]}}
-{"web_name": "Amazon", "id": "amazon_account_1", "task": "Click the account menu", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-link-accountList"}, "ground_truth": {"screenshot": "amazon_account_1_gt.png", "description": "Account menu opens", "visual_changes": ["Account overlay appears", "Account options visible"], "success_criteria": ["Overlay is visible", "Account options are clickable"]}}
-{"web_name": "Apple", "id": "apple_support_1", "task": "Click the 'Support' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Support"}, "ground_truth": {"screenshot": "apple_support_1_gt.png", "description": "Support page loads", "visual_changes": ["Page transitions to support", "Support options visible"], "success_criteria": ["Support page loads", "Support content visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_help_1", "task": "Click the 'Help' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Help"}, "ground_truth": {"screenshot": "arxiv_help_1_gt.png", "description": "Help page loads", "visual_changes": ["Page transitions to help", "Help content visible"], "success_criteria": ["Help page loads", "Help content visible"]}}
-{"web_name": "BBC News", "id": "bbc_video_1", "task": "Click the video player", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "media-player"}, "ground_truth": {"screenshot": "bbc_video_1_gt.png", "description": "Video player activates", "visual_changes": ["Video starts playing", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_grammar_1", "task": "Click the 'Grammar' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Grammar"}, "ground_truth": {"screenshot": "cambridge_grammar_1_gt.png", "description": "Grammar section loads", "visual_changes": ["Page transitions to grammar", "Grammar content visible"], "success_criteria": ["Grammar page loads", "Grammar content visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_print_1", "task": "Click the print button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Print"}, "ground_truth": {"screenshot": "allrecipes_print_1_gt.png", "description": "Print dialog opens", "visual_changes": ["Print overlay appears", "Print options visible"], "success_criteria": ["Print dialog visible", "Print options available"]}}
-{"web_name": "Amazon", "id": "amazon_orders_1", "task": "Click the 'Returns & Orders' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-orders"}, "ground_truth": {"screenshot": "amazon_orders_1_gt.png", "description": "Orders page loads", "visual_changes": ["Page transitions to orders", "Order history visible"], "success_criteria": ["Orders page loads", "Order history visible"]}}
-{"web_name": "BBC News", "id": "bbc_notification_1", "task": "Click the notification bell icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "notification-bell"}, "ground_truth": {"screenshot": "bbc_notification_1_gt.png", "description": "Notification settings open", "visual_changes": ["Notification overlay appears", "Notification options visible"], "success_criteria": ["Overlay is visible", "Settings are accessible"]}}
-{"web_name": "Booking", "id": "booking_property_1", "task": "Click the property type filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Property type"}, "ground_truth": {"screenshot": "booking_property_1_gt.png", "description": "Property types appear", "visual_changes": ["Filter overlay appears", "Property options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_save_1", "task": "Click the save recipe button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Save Recipe"}, "ground_truth": {"screenshot": "allrecipes_save_1_gt.png", "description": "Save options appear", "visual_changes": ["Save overlay appears", "Collection options visible"], "success_criteria": ["Save dialog visible", "Collections are selectable"]}}
-{"web_name": "Amazon", "id": "amazon_filter_1", "task": "Click the price filter dropdown", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Price"}, "ground_truth": {"screenshot": "amazon_filter_1_gt.png", "description": "Price ranges appear", "visual_changes": ["Price overlay appears", "Range options visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}}
-{"web_name": "Apple", "id": "apple_watch_1", "task": "Click the 'Watch' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Watch"}, "ground_truth": {"screenshot": "apple_watch_1_gt.png", "description": "Watch page loads", "visual_changes": ["Page transitions to Watch", "Watch products visible"], "success_criteria": ["Watch page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_pdf_1", "task": "Click the PDF link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "PDF"}, "ground_truth": {"screenshot": "arxiv_pdf_1_gt.png", "description": "PDF starts downloading", "visual_changes": ["Download starts", "Download indicator visible"], "success_criteria": ["Download begins", "Download status shown"]}}
-{"web_name": "BBC News", "id": "bbc_business_1", "task": "Click the 'Business' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Business"}, "ground_truth": {"screenshot": "bbc_business_1_gt.png", "description": "Business section loads", "visual_changes": ["Page transitions to business", "Business news visible"], "success_criteria": ["Business page loads", "Business content visible"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_thesaurus_1", "task": "Click the 'Thesaurus' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Thesaurus"}, "ground_truth": {"screenshot": "cambridge_thesaurus_1_gt.png", "description": "Thesaurus section loads", "visual_changes": ["Page transitions to thesaurus", "Thesaurus content visible"], "success_criteria": ["Thesaurus page loads", "Synonyms are visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_review_1", "task": "Click the reviews tab", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Reviews"}, "ground_truth": {"screenshot": "allrecipes_review_1_gt.png", "description": "Reviews section opens", "visual_changes": ["Reviews section appears", "Review content visible"], "success_criteria": ["Reviews are visible", "Rating information shown"]}}
-{"web_name": "Amazon", "id": "amazon_prime_1", "task": "Click the Prime benefits link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Prime"}, "ground_truth": {"screenshot": "amazon_prime_1_gt.png", "description": "Prime page loads", "visual_changes": ["Page transitions to Prime", "Prime benefits visible"], "success_criteria": ["Prime page loads", "Benefits are visible"]}}
-{"web_name": "Apple", "id": "apple_ipad_1", "task": "Click the 'iPad' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPad"}, "ground_truth": {"screenshot": "apple_ipad_1_gt.png", "description": "iPad page loads", "visual_changes": ["Page transitions to iPad", "iPad products visible"], "success_criteria": ["iPad page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_abstract_1", "task": "Click the abstract toggle", "web": "https://arxiv.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Abstract"}, "ground_truth": {"screenshot": "arxiv_abstract_1_gt.png", "description": "Abstract expands", "visual_changes": ["Abstract section expands", "Full text visible"], "success_criteria": ["Abstract is expanded", "Text is readable"]}}
-{"web_name": "BBC News", "id": "bbc_tech_1", "task": "Click the 'Technology' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Technology"}, "ground_truth": {"screenshot": "bbc_tech_1_gt.png", "description": "Technology section loads", "visual_changes": ["Page transitions to technology", "Tech news visible"], "success_criteria": ["Tech page loads", "Tech content visible"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_2", "task": "Click the search box and type 'bonjour'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "bonjour", "ground_truth": {"screenshot": "cambridge_translate_2_gt.png", "description": "Word entered in search", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_cuisine_1", "task": "Click the cuisine filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cuisine"}, "ground_truth": {"screenshot": "allrecipes_cuisine_1_gt.png", "description": "Cuisine options appear", "visual_changes": ["Filter overlay appears", "Cuisine options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}}
-{"web_name": "Amazon", "id": "amazon_deals_1", "task": "Click the 'Today's Deals' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Today's Deals"}, "ground_truth": {"screenshot": "amazon_deals_1_gt.png", "description": "Deals page loads", "visual_changes": ["Page transitions to deals", "Deal items visible"], "success_criteria": ["Deals page loads", "Deals are visible"]}}
-{"web_name": "Booking", "id": "booking_map_1", "task": "Click the map view button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Map"}, "ground_truth": {"screenshot": "booking_map_1_gt.png", "description": "Map view opens", "visual_changes": ["Map interface appears", "Property markers visible"], "success_criteria": ["Map is visible", "Properties are plotted"]}}
-{"web_name": "Apple", "id": "apple_airpods_1", "task": "Click the 'AirPods' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "AirPods"}, "ground_truth": {"screenshot": "apple_airpods_1_gt.png", "description": "AirPods page loads", "visual_changes": ["Page transitions to AirPods", "AirPods products visible"], "success_criteria": ["AirPods page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_author_1", "task": "Click the author search link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Author"}, "ground_truth": {"screenshot": "arxiv_author_1_gt.png", "description": "Author search opens", "visual_changes": ["Author search interface appears", "Search options visible"], "success_criteria": ["Search interface visible", "Author field active"]}}
-{"web_name": "BBC News", "id": "bbc_share_1", "task": "Click the share button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "share-button"}, "ground_truth": {"screenshot": "bbc_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Options are clickable"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_examples_1", "task": "Click the 'Examples' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Examples"}, "ground_truth": {"screenshot": "cambridge_examples_1_gt.png", "description": "Examples section loads", "visual_changes": ["Page transitions to examples", "Usage examples visible"], "success_criteria": ["Examples page loads", "Examples are visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_nutrition_1", "task": "Click the nutrition info button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Nutrition"}, "ground_truth": {"screenshot": "allrecipes_nutrition_1_gt.png", "description": "Nutrition info appears", "visual_changes": ["Nutrition overlay appears", "Nutritional values visible"], "success_criteria": ["Overlay is visible", "Values are readable"]}}
-{"web_name": "Amazon", "id": "amazon_wishlist_1", "task": "Click the 'Add to List' button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Add to List"}, "ground_truth": {"screenshot": "amazon_wishlist_1_gt.png", "description": "List options appear", "visual_changes": ["List overlay appears", "List options visible"], "success_criteria": ["Overlay is visible", "Lists are selectable"]}}
-{"web_name": "Apple", "id": "apple_iphone_1", "task": "Click the 'iPhone' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPhone"}, "ground_truth": {"screenshot": "apple_iphone_1_gt.png", "description": "iPhone page loads", "visual_changes": ["Page transitions to iPhone", "iPhone products visible"], "success_criteria": ["iPhone page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_date_1", "task": "Click the date range filter", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "date-range"}, "ground_truth": {"screenshot": "arxiv_date_1_gt.png", "description": "Date options appear", "visual_changes": ["Date dropdown appears", "Range options visible"], "success_criteria": ["Dropdown is visible", "Ranges are selectable"]}}
-{"web_name": "BBC News", "id": "bbc_climate_1", "task": "Click the 'Climate' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Climate"}, "ground_truth": {"screenshot": "bbc_climate_1_gt.png", "description": "Climate section loads", "visual_changes": ["Page transitions to climate", "Climate news visible"], "success_criteria": ["Climate page loads", "Climate content visible"]}}
-{"web_name": "Booking", "id": "booking_rating_1", "task": "Click the rating filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Rating"}, "ground_truth": {"screenshot": "booking_rating_1_gt.png", "description": "Rating options appear", "visual_changes": ["Rating overlay appears", "Score options visible"], "success_criteria": ["Overlay is visible", "Ratings are selectable"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_browse_1", "task": "Click the 'Browse Dictionary' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Browse Dictionary"}, "ground_truth": {"screenshot": "cambridge_browse_1_gt.png", "description": "Browse page loads", "visual_changes": ["Page transitions to browse", "Word categories visible"], "success_criteria": ["Browse page loads", "Categories are visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_video_1", "task": "Click the recipe video play button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "video-play"}, "ground_truth": {"screenshot": "allrecipes_video_1_gt.png", "description": "Video starts playing", "visual_changes": ["Video begins playback", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}}
-{"web_name": "Amazon", "id": "amazon_seller_1", "task": "Click the 'Other Sellers' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Other Sellers"}, "ground_truth": {"screenshot": "amazon_seller_1_gt.png", "description": "Seller options appear", "visual_changes": ["Seller list appears", "Price options visible"], "success_criteria": ["Seller list visible", "Prices are shown"]}}
-{"web_name": "Apple", "id": "apple_tv_1", "task": "Click the 'TV & Home' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "TV & Home"}, "ground_truth": {"screenshot": "apple_tv_1_gt.png", "description": "TV & Home page loads", "visual_changes": ["Page transitions to TV", "TV products visible"], "success_criteria": ["TV page loads", "Products are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_title_1", "task": "Click the search box and type 'machine learning'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "machine learning", "ground_truth": {"screenshot": "arxiv_title_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}}
-{"web_name": "BBC News", "id": "bbc_science_1", "task": "Click the 'Science' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Science"}, "ground_truth": {"screenshot": "bbc_science_1_gt.png", "description": "Science section loads", "visual_changes": ["Page transitions to science", "Science news visible"], "success_criteria": ["Science page loads", "Science content visible"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_word_1", "task": "Click the 'Word of the Day' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Word of the Day"}, "ground_truth": {"screenshot": "cambridge_word_1_gt.png", "description": "Word of the Day loads", "visual_changes": ["Page transitions to word", "Word details visible"], "success_criteria": ["Word page loads", "Definition visible"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_time_1", "task": "Click the cooking time filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cooking Time"}, "ground_truth": {"screenshot": "allrecipes_time_1_gt.png", "description": "Time options appear", "visual_changes": ["Filter overlay appears", "Time ranges visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}}
-{"web_name": "Amazon", "id": "amazon_gift_1", "task": "Click the 'Gift Cards' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Gift Cards"}, "ground_truth": {"screenshot": "amazon_gift_1_gt.png", "description": "Gift cards page loads", "visual_changes": ["Page transitions to gifts", "Gift card options visible"], "success_criteria": ["Gift page loads", "Options are visible"]}}
-{"web_name": "Booking", "id": "booking_popular_1", "task": "Click the 'Popular filters' button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Popular filters"}, "ground_truth": {"screenshot": "booking_popular_1_gt.png", "description": "Popular filters appear", "visual_changes": ["Filter overlay appears", "Popular options visible"], "success_criteria": ["Overlay is visible", "Filters are selectable"]}}
-{"web_name": "Apple", "id": "apple_music_1", "task": "Click the 'Music' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Music"}, "ground_truth": {"screenshot": "apple_music_1_gt.png", "description": "Music page loads", "visual_changes": ["Page transitions to music", "Music services visible"], "success_criteria": ["Music page loads", "Services are visible"]}}
-{"web_name": "ArXiv", "id": "arxiv_stats_1", "task": "Click the 'Statistics' subject link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Statistics"}, "ground_truth": {"screenshot": "arxiv_stats_1_gt.png", "description": "Statistics papers load", "visual_changes": ["Page transitions to stats", "Statistics papers visible"], "success_criteria": ["Stats page loads", "Papers are visible"]}}
-{"web_name": "BBC News", "id": "bbc_local_1", "task": "Click the 'Local News' link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Local News"}, "ground_truth": {"screenshot": "bbc_local_1_gt.png", "description": "Local news loads", "visual_changes": ["Page transitions to local", "Local news visible"], "success_criteria": ["Local page loads", "News is visible"]}}
\ No newline at end of file
+{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "target_html": "", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "target_html": "", "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "success_criteria": ["Search button responds to click", "Results page loads", "No error messages displayed"]}}
+{"web_name": "AllRecipes", "id": "allrecipes_search_1", "task": "Click the search box and type 'vegetarian lasagna'", "web": "https://www.allrecipes.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-box"}, "input_text": "vegetarian lasagna", "ground_truth": {"screenshot": "allrecipes_search_1_gt.png", "description": "Search term entered in search box", "visual_changes": ["Text appears in search box", "Search suggestions may appear"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_filter_1", "task": "Click the 'Ratings' filter button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ratings"}, "ground_truth": {"screenshot": "allrecipes_filter_1_gt.png", "description": "Ratings filter dropdown opens", "visual_changes": ["Dropdown menu appears", "Filter options visible"], "success_criteria": ["Dropdown menu is visible", "Filter options are clickable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_search_1", "task": "Click the search box and type 'laptop'", "web": "https://www.amazon.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "twotabsearchtextbox"}, "input_text": "laptop", "ground_truth": {"screenshot": "amazon_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Search suggestions appear"], "success_criteria": ["Text matches exactly", "Search suggestions visible"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_menu_1", "task": "Click the hamburger menu button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "nav-hamburger-menu"}, "ground_truth": {"screenshot": "amazon_menu_1_gt.png", "description": "Side menu opens", "visual_changes": ["Menu slides in from left", "Menu options visible"], "success_criteria": ["Menu is visible", "Menu items are clickable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_menu_1", "task": "Click the 'Mac' menu item", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Mac"}, "ground_truth": {"screenshot": "apple_menu_1_gt.png", "description": "Mac page loads", "visual_changes": ["Page transitions to Mac section", "Mac products visible"], "success_criteria": ["Page URL changes", "Mac content visible"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_search_1", "task": "Click the search icon", "web": "https://www.apple.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-search-trigger"}, "ground_truth": {"screenshot": "apple_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search box is active"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_search_1", "task": "Click the search box and type 'quantum computing'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "quantum computing", "ground_truth": {"screenshot": "arxiv_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_filter_1", "task": "Click the 'Advanced Search' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Advanced Search"}, "ground_truth": {"screenshot": "arxiv_filter_1_gt.png", "description": "Advanced search page loads", "visual_changes": ["Page transitions to advanced search", "Search filters visible"], "success_criteria": ["URL changes to advanced search", "Advanced search form visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_menu_1", "task": "Click the menu button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "orbit-more-menu"}, "ground_truth": {"screenshot": "bbc_menu_1_gt.png", "description": "Menu overlay opens", "visual_changes": ["Menu overlay appears", "Navigation options visible"], "success_criteria": ["Menu overlay is visible", "Menu items are clickable"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_search_1", "task": "Click the search icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "orbit-search__button"}, "ground_truth": {"screenshot": "bbc_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search input is active"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_destination_1", "task": "Click the destination input and type 'Paris'", "web": "https://www.booking.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "ss"}, "input_text": "Paris", "ground_truth": {"screenshot": "booking_destination_1_gt.png", "description": "Destination entered", "visual_changes": ["Text appears in input", "Location suggestions appear"], "success_criteria": ["Text matches exactly", "Suggestions are visible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_dates_1", "task": "Click the check-in date field", "web": "https://www.booking.com/", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "sb-date-field__field"}, "ground_truth": {"screenshot": "booking_dates_1_gt.png", "description": "Calendar overlay opens", "visual_changes": ["Calendar overlay appears", "Available dates highlighted"], "success_criteria": ["Calendar is visible", "Dates are selectable"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_menu_1", "task": "Click the 'More' menu button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "More"}, "ground_truth": {"screenshot": "cambridge_menu_1_gt.png", "description": "More menu opens", "visual_changes": ["Dropdown menu appears", "Menu options visible"], "success_criteria": ["Dropdown is visible", "Menu items are clickable"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_menu_1", "task": "Click the 'Ingredients' button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ingredients"}, "ground_truth": {"screenshot": "allrecipes_menu_1_gt.png", "description": "Ingredients section expands", "visual_changes": ["Section expands", "Ingredient list visible"], "success_criteria": ["Section is expanded", "Ingredients are visible"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the cart icon", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "ground_truth": {"screenshot": "amazon_cart_1_gt.png", "description": "Cart page loads", "visual_changes": ["Page transitions to cart", "Cart contents visible"], "success_criteria": ["Cart page loads", "Cart status visible"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_store_1", "task": "Click the 'Store' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Store"}, "ground_truth": {"screenshot": "apple_store_1_gt.png", "description": "Store page loads", "visual_changes": ["Page transitions to store", "Store products visible"], "success_criteria": ["Store page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_recent_1", "task": "Click the 'Recent' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "recent"}, "ground_truth": {"screenshot": "arxiv_recent_1_gt.png", "description": "Recent submissions page loads", "visual_changes": ["Page transitions to recent submissions", "Recent papers visible"], "success_criteria": ["Recent page loads", "Papers are visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_sport_1", "task": "Click the 'Sport' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Sport"}, "ground_truth": {"screenshot": "bbc_sport_1_gt.png", "description": "Sports section loads", "visual_changes": ["Page transitions to sports", "Sports news visible"], "success_criteria": ["Sports page loads", "Sports content visible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_guests_1", "task": "Click the guests selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "xp__guests__count"}, "ground_truth": {"screenshot": "booking_guests_1_gt.png", "description": "Guests selector opens", "visual_changes": ["Guests overlay appears", "Guest options visible"], "success_criteria": ["Overlay is visible", "Guest controls are active"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_search_1", "task": "Click the search button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "sb-searchbox__button"}, "ground_truth": {"screenshot": "booking_search_1_gt.png", "description": "Search results load", "visual_changes": ["Page transitions to results", "Available properties shown"], "success_criteria": ["Results page loads", "Properties are visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_diet_1", "task": "Click the 'Dietary Restrictions' filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Dietary Restrictions"}, "ground_truth": {"screenshot": "allrecipes_diet_1_gt.png", "description": "Diet filter opens", "visual_changes": ["Filter dropdown appears", "Diet options visible"], "success_criteria": ["Dropdown is visible", "Options are clickable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_department_1", "task": "Click the department selector", "web": "https://www.amazon.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "searchDropdownBox"}, "ground_truth": {"screenshot": "amazon_department_1_gt.png", "description": "Department dropdown opens", "visual_changes": ["Dropdown menu appears", "Department list visible"], "success_criteria": ["Dropdown is visible", "Departments are selectable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_bag_1", "task": "Click the shopping bag icon", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-bag-item"}, "ground_truth": {"screenshot": "apple_bag_1_gt.png", "description": "Shopping bag overlay opens", "visual_changes": ["Bag overlay appears", "Cart contents visible"], "success_criteria": ["Overlay is visible", "Cart status shown"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_subject_1", "task": "Click the subject area dropdown", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "subject"}, "ground_truth": {"screenshot": "arxiv_subject_1_gt.png", "description": "Subject dropdown opens", "visual_changes": ["Dropdown menu appears", "Subject areas visible"], "success_criteria": ["Dropdown is visible", "Subjects are selectable"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_region_1", "task": "Click the region selector", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Choose your region"}, "ground_truth": {"screenshot": "bbc_region_1_gt.png", "description": "Region selector opens", "visual_changes": ["Region overlay appears", "Region options visible"], "success_criteria": ["Overlay is visible", "Regions are selectable"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_1", "task": "Click the translation language selector", "web": "https://dictionary.cambridge.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "translation-language"}, "ground_truth": {"screenshot": "cambridge_translate_1_gt.png", "description": "Language dropdown opens", "visual_changes": ["Dropdown menu appears", "Language options visible"], "success_criteria": ["Dropdown is visible", "Languages are selectable"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_sort_1", "task": "Click the sort dropdown", "web": "https://www.allrecipes.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "sort-dropdown"}, "ground_truth": {"screenshot": "allrecipes_sort_1_gt.png", "description": "Sort options appear", "visual_changes": ["Dropdown menu appears", "Sort options visible"], "success_criteria": ["Dropdown is visible", "Options are selectable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_language_1", "task": "Click the language selector", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "icp-nav-flyout"}, "ground_truth": {"screenshot": "amazon_language_1_gt.png", "description": "Language overlay opens", "visual_changes": ["Language overlay appears", "Language options visible"], "success_criteria": ["Overlay is visible", "Languages are selectable"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_weather_1", "task": "Click the weather widget", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "weather-widget"}, "ground_truth": {"screenshot": "bbc_weather_1_gt.png", "description": "Weather details expand", "visual_changes": ["Weather details appear", "Forecast visible"], "success_criteria": ["Weather details visible", "Forecast information shown"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_currency_1", "task": "Click the currency selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "bui-button__text"}, "ground_truth": {"screenshot": "booking_currency_1_gt.png", "description": "Currency selector opens", "visual_changes": ["Currency overlay appears", "Currency options visible"], "success_criteria": ["Overlay is visible", "Currencies are selectable"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_share_1", "task": "Click the share button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Share"}, "ground_truth": {"screenshot": "allrecipes_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Share options are clickable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_account_1", "task": "Click the account menu", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-link-accountList"}, "ground_truth": {"screenshot": "amazon_account_1_gt.png", "description": "Account menu opens", "visual_changes": ["Account overlay appears", "Account options visible"], "success_criteria": ["Overlay is visible", "Account options are clickable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_support_1", "task": "Click the 'Support' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Support"}, "ground_truth": {"screenshot": "apple_support_1_gt.png", "description": "Support page loads", "visual_changes": ["Page transitions to support", "Support options visible"], "success_criteria": ["Support page loads", "Support content visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_help_1", "task": "Click the 'Help' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Help"}, "ground_truth": {"screenshot": "arxiv_help_1_gt.png", "description": "Help page loads", "visual_changes": ["Page transitions to help", "Help content visible"], "success_criteria": ["Help page loads", "Help content visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_video_1", "task": "Click the video player", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "media-player"}, "ground_truth": {"screenshot": "bbc_video_1_gt.png", "description": "Video player activates", "visual_changes": ["Video starts playing", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_grammar_1", "task": "Click the 'Grammar' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Grammar"}, "ground_truth": {"screenshot": "cambridge_grammar_1_gt.png", "description": "Grammar section loads", "visual_changes": ["Page transitions to grammar", "Grammar content visible"], "success_criteria": ["Grammar page loads", "Grammar content visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_print_1", "task": "Click the print button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Print"}, "ground_truth": {"screenshot": "allrecipes_print_1_gt.png", "description": "Print dialog opens", "visual_changes": ["Print overlay appears", "Print options visible"], "success_criteria": ["Print dialog visible", "Print options available"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_orders_1", "task": "Click the 'Returns & Orders' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-orders"}, "ground_truth": {"screenshot": "amazon_orders_1_gt.png", "description": "Orders page loads", "visual_changes": ["Page transitions to orders", "Order history visible"], "success_criteria": ["Orders page loads", "Order history visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_notification_1", "task": "Click the notification bell icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "notification-bell"}, "ground_truth": {"screenshot": "bbc_notification_1_gt.png", "description": "Notification settings open", "visual_changes": ["Notification overlay appears", "Notification options visible"], "success_criteria": ["Overlay is visible", "Settings are accessible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_property_1", "task": "Click the property type filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Property type"}, "ground_truth": {"screenshot": "booking_property_1_gt.png", "description": "Property types appear", "visual_changes": ["Filter overlay appears", "Property options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_save_1", "task": "Click the save recipe button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Save Recipe"}, "ground_truth": {"screenshot": "allrecipes_save_1_gt.png", "description": "Save options appear", "visual_changes": ["Save overlay appears", "Collection options visible"], "success_criteria": ["Save dialog visible", "Collections are selectable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_filter_1", "task": "Click the price filter dropdown", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Price"}, "ground_truth": {"screenshot": "amazon_filter_1_gt.png", "description": "Price ranges appear", "visual_changes": ["Price overlay appears", "Range options visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_watch_1", "task": "Click the 'Watch' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Watch"}, "ground_truth": {"screenshot": "apple_watch_1_gt.png", "description": "Watch page loads", "visual_changes": ["Page transitions to Watch", "Watch products visible"], "success_criteria": ["Watch page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_pdf_1", "task": "Click the PDF link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "PDF"}, "ground_truth": {"screenshot": "arxiv_pdf_1_gt.png", "description": "PDF starts downloading", "visual_changes": ["Download starts", "Download indicator visible"], "success_criteria": ["Download begins", "Download status shown"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_business_1", "task": "Click the 'Business' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Business"}, "ground_truth": {"screenshot": "bbc_business_1_gt.png", "description": "Business section loads", "visual_changes": ["Page transitions to business", "Business news visible"], "success_criteria": ["Business page loads", "Business content visible"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_thesaurus_1", "task": "Click the 'Thesaurus' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Thesaurus"}, "ground_truth": {"screenshot": "cambridge_thesaurus_1_gt.png", "description": "Thesaurus section loads", "visual_changes": ["Page transitions to thesaurus", "Thesaurus content visible"], "success_criteria": ["Thesaurus page loads", "Synonyms are visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_review_1", "task": "Click the reviews tab", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Reviews"}, "ground_truth": {"screenshot": "allrecipes_review_1_gt.png", "description": "Reviews section opens", "visual_changes": ["Reviews section appears", "Review content visible"], "success_criteria": ["Reviews are visible", "Rating information shown"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_prime_1", "task": "Click the Prime benefits link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Prime"}, "ground_truth": {"screenshot": "amazon_prime_1_gt.png", "description": "Prime page loads", "visual_changes": ["Page transitions to Prime", "Prime benefits visible"], "success_criteria": ["Prime page loads", "Benefits are visible"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_ipad_1", "task": "Click the 'iPad' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPad"}, "ground_truth": {"screenshot": "apple_ipad_1_gt.png", "description": "iPad page loads", "visual_changes": ["Page transitions to iPad", "iPad products visible"], "success_criteria": ["iPad page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_abstract_1", "task": "Click the abstract toggle", "web": "https://arxiv.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Abstract"}, "ground_truth": {"screenshot": "arxiv_abstract_1_gt.png", "description": "Abstract expands", "visual_changes": ["Abstract section expands", "Full text visible"], "success_criteria": ["Abstract is expanded", "Text is readable"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_tech_1", "task": "Click the 'Technology' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Technology"}, "ground_truth": {"screenshot": "bbc_tech_1_gt.png", "description": "Technology section loads", "visual_changes": ["Page transitions to technology", "Tech news visible"], "success_criteria": ["Tech page loads", "Tech content visible"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_2", "task": "Click the search box and type 'bonjour'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "bonjour", "ground_truth": {"screenshot": "cambridge_translate_2_gt.png", "description": "Word entered in search", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_cuisine_1", "task": "Click the cuisine filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cuisine"}, "ground_truth": {"screenshot": "allrecipes_cuisine_1_gt.png", "description": "Cuisine options appear", "visual_changes": ["Filter overlay appears", "Cuisine options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_deals_1", "task": "Click the 'Today's Deals' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Today's Deals"}, "ground_truth": {"screenshot": "amazon_deals_1_gt.png", "description": "Deals page loads", "visual_changes": ["Page transitions to deals", "Deal items visible"], "success_criteria": ["Deals page loads", "Deals are visible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_map_1", "task": "Click the map view button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Map"}, "ground_truth": {"screenshot": "booking_map_1_gt.png", "description": "Map view opens", "visual_changes": ["Map interface appears", "Property markers visible"], "success_criteria": ["Map is visible", "Properties are plotted"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_airpods_1", "task": "Click the 'AirPods' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "AirPods"}, "ground_truth": {"screenshot": "apple_airpods_1_gt.png", "description": "AirPods page loads", "visual_changes": ["Page transitions to AirPods", "AirPods products visible"], "success_criteria": ["AirPods page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_author_1", "task": "Click the author search link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Author"}, "ground_truth": {"screenshot": "arxiv_author_1_gt.png", "description": "Author search opens", "visual_changes": ["Author search interface appears", "Search options visible"], "success_criteria": ["Search interface visible", "Author field active"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_share_1", "task": "Click the share button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "share-button"}, "ground_truth": {"screenshot": "bbc_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Options are clickable"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_examples_1", "task": "Click the 'Examples' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Examples"}, "ground_truth": {"screenshot": "cambridge_examples_1_gt.png", "description": "Examples section loads", "visual_changes": ["Page transitions to examples", "Usage examples visible"], "success_criteria": ["Examples page loads", "Examples are visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_nutrition_1", "task": "Click the nutrition info button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Nutrition"}, "ground_truth": {"screenshot": "allrecipes_nutrition_1_gt.png", "description": "Nutrition info appears", "visual_changes": ["Nutrition overlay appears", "Nutritional values visible"], "success_criteria": ["Overlay is visible", "Values are readable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_wishlist_1", "task": "Click the 'Add to List' button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Add to List"}, "ground_truth": {"screenshot": "amazon_wishlist_1_gt.png", "description": "List options appear", "visual_changes": ["List overlay appears", "List options visible"], "success_criteria": ["Overlay is visible", "Lists are selectable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_iphone_1", "task": "Click the 'iPhone' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPhone"}, "ground_truth": {"screenshot": "apple_iphone_1_gt.png", "description": "iPhone page loads", "visual_changes": ["Page transitions to iPhone", "iPhone products visible"], "success_criteria": ["iPhone page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_date_1", "task": "Click the date range filter", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "date-range"}, "ground_truth": {"screenshot": "arxiv_date_1_gt.png", "description": "Date options appear", "visual_changes": ["Date dropdown appears", "Range options visible"], "success_criteria": ["Dropdown is visible", "Ranges are selectable"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_climate_1", "task": "Click the 'Climate' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Climate"}, "ground_truth": {"screenshot": "bbc_climate_1_gt.png", "description": "Climate section loads", "visual_changes": ["Page transitions to climate", "Climate news visible"], "success_criteria": ["Climate page loads", "Climate content visible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_rating_1", "task": "Click the rating filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Rating"}, "ground_truth": {"screenshot": "booking_rating_1_gt.png", "description": "Rating options appear", "visual_changes": ["Rating overlay appears", "Score options visible"], "success_criteria": ["Overlay is visible", "Ratings are selectable"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_browse_1", "task": "Click the 'Browse Dictionary' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Browse Dictionary"}, "ground_truth": {"screenshot": "cambridge_browse_1_gt.png", "description": "Browse page loads", "visual_changes": ["Page transitions to browse", "Word categories visible"], "success_criteria": ["Browse page loads", "Categories are visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_video_1", "task": "Click the recipe video play button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "video-play"}, "ground_truth": {"screenshot": "allrecipes_video_1_gt.png", "description": "Video starts playing", "visual_changes": ["Video begins playback", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_seller_1", "task": "Click the 'Other Sellers' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Other Sellers"}, "ground_truth": {"screenshot": "amazon_seller_1_gt.png", "description": "Seller options appear", "visual_changes": ["Seller list appears", "Price options visible"], "success_criteria": ["Seller list visible", "Prices are shown"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_tv_1", "task": "Click the 'TV & Home' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "TV & Home"}, "ground_truth": {"screenshot": "apple_tv_1_gt.png", "description": "TV & Home page loads", "visual_changes": ["Page transitions to TV", "TV products visible"], "success_criteria": ["TV page loads", "Products are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_title_1", "task": "Click the search box and type 'machine learning'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "machine learning", "ground_truth": {"screenshot": "arxiv_title_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_science_1", "task": "Click the 'Science' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Science"}, "ground_truth": {"screenshot": "bbc_science_1_gt.png", "description": "Science section loads", "visual_changes": ["Page transitions to science", "Science news visible"], "success_criteria": ["Science page loads", "Science content visible"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_word_1", "task": "Click the 'Word of the Day' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Word of the Day"}, "ground_truth": {"screenshot": "cambridge_word_1_gt.png", "description": "Word of the Day loads", "visual_changes": ["Page transitions to word", "Word details visible"], "success_criteria": ["Word page loads", "Definition visible"]}, "target_html": ""}
+{"web_name": "AllRecipes", "id": "allrecipes_time_1", "task": "Click the cooking time filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cooking Time"}, "ground_truth": {"screenshot": "allrecipes_time_1_gt.png", "description": "Time options appear", "visual_changes": ["Filter overlay appears", "Time ranges visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""}
+{"web_name": "Amazon", "id": "amazon_gift_1", "task": "Click the 'Gift Cards' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Gift Cards"}, "ground_truth": {"screenshot": "amazon_gift_1_gt.png", "description": "Gift cards page loads", "visual_changes": ["Page transitions to gifts", "Gift card options visible"], "success_criteria": ["Gift page loads", "Options are visible"]}, "target_html": ""}
+{"web_name": "Booking", "id": "booking_popular_1", "task": "Click the 'Popular filters' button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Popular filters"}, "ground_truth": {"screenshot": "booking_popular_1_gt.png", "description": "Popular filters appear", "visual_changes": ["Filter overlay appears", "Popular options visible"], "success_criteria": ["Overlay is visible", "Filters are selectable"]}, "target_html": ""}
+{"web_name": "Apple", "id": "apple_music_1", "task": "Click the 'Music' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Music"}, "ground_truth": {"screenshot": "apple_music_1_gt.png", "description": "Music page loads", "visual_changes": ["Page transitions to music", "Music services visible"], "success_criteria": ["Music page loads", "Services are visible"]}, "target_html": ""}
+{"web_name": "ArXiv", "id": "arxiv_stats_1", "task": "Click the 'Statistics' subject link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Statistics"}, "ground_truth": {"screenshot": "arxiv_stats_1_gt.png", "description": "Statistics papers load", "visual_changes": ["Page transitions to stats", "Statistics papers visible"], "success_criteria": ["Stats page loads", "Papers are visible"]}, "target_html": ""}
+{"web_name": "BBC News", "id": "bbc_local_1", "task": "Click the 'Local News' link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Local News"}, "ground_truth": {"screenshot": "bbc_local_1_gt.png", "description": "Local news loads", "visual_changes": ["Page transitions to local", "Local news visible"], "success_criteria": ["Local page loads", "News is visible"]}, "target_html": ""}
diff --git a/data/task_schema.json b/data/task_schema.json
new file mode 100644
index 0000000..0ff97ff
--- /dev/null
+++ b/data/task_schema.json
@@ -0,0 +1,108 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "DOM Task Schema",
+ "description": "Schema for DOM interaction tasks in the benchmark",
+ "type": "object",
+ "required": [
+ "web_name",
+ "id",
+ "task",
+ "web",
+ "element_type",
+ "interaction",
+ "target_element",
+ "target_html",
+ "ground_truth"
+ ],
+ "properties": {
+ "web_name": {
+ "type": "string",
+ "description": "Name of the website"
+ },
+ "id": {
+ "type": "string",
+ "description": "Unique identifier for the task",
+ "pattern": "^[a-z0-9_]+$"
+ },
+ "task": {
+ "type": "string",
+ "description": "Human-readable task description"
+ },
+ "web": {
+ "type": "string",
+ "description": "Website URL",
+ "format": "uri"
+ },
+ "element_type": {
+ "type": "string",
+ "description": "Type of HTML element to interact with",
+ "enum": ["input", "button", "link", "select", "div", "span"]
+ },
+ "interaction": {
+ "type": "string",
+ "description": "Type of interaction to perform",
+ "enum": ["click", "type", "hover"]
+ },
+ "target_element": {
+ "type": "object",
+ "description": "How to find the element",
+ "required": ["type", "value"],
+ "properties": {
+ "type": {
+ "type": "string",
+ "description": "Type of selector to use",
+ "enum": ["id", "class", "text", "name"]
+ },
+ "value": {
+ "type": "string",
+ "description": "Value of the selector"
+ }
+ }
+ },
+ "input_text": {
+ "type": "string",
+ "description": "Text to type (only required for type interactions)"
+ },
+ "target_html": {
+ "type": "string",
+ "description": "The actual HTML element to match against for validation"
+ },
+ "ground_truth": {
+ "type": "object",
+ "description": "Validation data",
+ "required": [
+ "screenshot",
+ "description",
+ "visual_changes",
+ "success_criteria"
+ ],
+ "properties": {
+ "screenshot": {
+ "type": "string",
+ "description": "Filename of the ground truth screenshot",
+ "pattern": "^[a-z0-9_]+\\.png$"
+ },
+ "description": {
+ "type": "string",
+ "description": "Description of the expected outcome"
+ },
+ "visual_changes": {
+ "type": "array",
+ "description": "List of expected visual changes",
+ "items": {
+ "type": "string"
+ },
+ "minItems": 1
+ },
+ "success_criteria": {
+ "type": "array",
+ "description": "List of specific conditions that must be met for success",
+ "items": {
+ "type": "string"
+ },
+ "minItems": 1
+ }
+ }
+ }
+ }
+}
diff --git a/evaluation/README.md b/evaluation/README.md
index b379c3d..01cb9f2 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -4,10 +4,9 @@ This directory contains the evaluation tools for the DOM and DOMer-2 benchmark.
## Overview
-The evaluation uses GPT-4V to assess web interactions by analyzing:
-1. Before/After screenshots of the webpage
-2. Accessibility tree information
-3. Task descriptions and expected outcomes
+The evaluation system combines two approaches:
+1. Visual Validation (60% of score): Using GPT-4V to analyze screenshots
+2. HTML Element Validation (40% of score): Comparing actual HTML elements
## Usage
@@ -21,20 +20,22 @@ python auto_eval.py \
## Evaluation Process
-1. **Screenshot Analysis**
- - Compare before/after states
+1. **Visual Validation (60%)**
+ - Compare before/after screenshots
- Verify visual changes match expected interaction
- Check element visibility and state changes
+ - Uses GPT-4V for intelligent visual comparison
-2. **Accessibility Tree Verification**
- - Validate correct element was targeted
- - Check element attributes and relationships
- - Verify element state changes
+2. **HTML Element Validation (40%)**
+ - Compare model's selected HTML element with ground truth
+ - Structure score (40%): Tag hierarchy and relationships
+ - Attributes score (30%): Element properties and identifiers
+ - Content score (30%): Inner HTML and text content
3. **Success Criteria**
- - Correct element identified and interacted with
- - Expected visual changes occurred
- - No unintended side effects
+ - Visual score ≥ 0.9 for visual validation
+ - HTML similarity score ≥ 0.9 for element validation
+ - Combined weighted score ≥ 0.9 for overall success
## Output Format
@@ -45,15 +46,49 @@ python auto_eval.py \
"evaluations": [
{
"task_id": "task_001",
+ "visual_evaluation": {
+ "score": 0.95,
+ "details": "Detailed visual evaluation..."
+ },
+ "html_evaluation": {
+ "score": 0.92,
+ "structure_score": 0.95,
+ "attributes_score": 0.90,
+ "content_score": 0.89
+ },
+ "final_score": 0.94,
"success": true,
- "evaluation": "Detailed evaluation text...",
"timestamp": 1234567890
- },
- ...
+ }
]
}
```
+## Scoring Details
+
+### Visual Score (60%)
+- Element visibility and positioning
+- State changes (hover effects, expansions)
+- Content updates and transitions
+- Overall visual accuracy
+
+### HTML Score (40%)
+1. **Structure (40% of HTML score)**
+ - Correct tag name
+ - Parent-child relationships
+ - Sibling context
+
+2. **Attributes (30% of HTML score)**
+ - ID and class matching
+ - ARIA attributes
+ - Event handlers
+ - Custom data attributes
+
+3. **Content (30% of HTML score)**
+ - Inner HTML similarity
+ - Text content matching
+ - Nested element structure
+
## Requirements
- OpenAI API key with GPT-4V access
diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py
index f0f1a05..8b24a2a 100644
--- a/evaluation/auto_eval.py
+++ b/evaluation/auto_eval.py
@@ -5,111 +5,143 @@
import base64
from pathlib import Path
from typing import List, Dict, Any
-
+import logging
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
from openai import OpenAI
from dotenv import load_dotenv
-SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on visual comparison:
-
-1. Task Description: A specific web interaction task (e.g., "Click the search button", "Type text in input field")
-
-2. Visual Validation:
- - Before: Initial webpage state
- - After: Actual result after interaction
- - Ground Truth: Expected result for successful interaction
- - Expected Visual Changes: List of specific visual changes to verify
- - Success Criteria: Specific conditions that must be met
-
-Your evaluation should:
-1. Compare the after screenshot with the ground truth screenshot
-2. Verify all listed visual changes occurred
-3. Check if all success criteria are met
-4. Pay special attention to the relevant regions where changes should occur
-
-Provide your evaluation as:
-1. A score from 0-100 based on visual similarity and completion of expected changes
-2. 'SUCCESS' if score ≥ 90, otherwise 'NOT SUCCESS'
-3. Brief explanation of:
- - Visual changes observed/missing
- - Success criteria met/unmet
- - Why the interaction succeeded or failed"""
+SYSTEM_PROMPT = """You are an expert web automation evaluator. Your task is to:
+1. Analyze the provided HTML source and accessibility tree
+2. Identify and extract the complete HTML element that matches the target description
+3. Score the visual interaction based on the provided before/after screenshots
+
+For HTML element selection:
+- Return the complete HTML element including its attributes and inner content
+- Consider the element's context and relationship with surrounding elements
+- Ensure the selected element uniquely matches the target description
+
+For visual evaluation:
+- Score how well the interaction matches the expected outcome
+- Consider element visibility, positioning, and state changes
+- Account for any dynamic content or loading states
+
+Provide your response in the following JSON format:
+{
+ "selected_html": "",
+ "visual_score": float, # 0.0 to 1.0
+ "confidence": float, # 0.0 to 1.0
+ "reasoning": "string" # Brief explanation of your evaluation
+}"""
def encode_image(image_path: str) -> str:
"""Encode image as base64 string"""
with open(image_path, "rb") as f:
return base64.b64encode(f.read()).decode('utf-8')
+def get_element_html_context(driver: webdriver.Chrome, element) -> str:
+    """Return the serialized markup (``outerHTML``) of ``element``.
+
+    The markup is read inside the browser by running a one-line script
+    through ``driver.execute_script`` with ``element`` passed as
+    ``arguments[0]``.
+    """
+    outer_html_script = "return arguments[0].outerHTML;"
+    outer_html = driver.execute_script(outer_html_script, element)
+    return outer_html
+
+def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]:
+    """Fetch accessibility information for the page currently loaded in ``driver``.
+
+    Runs ``window.axe.getEntireContext()`` in the page and returns whatever
+    that call yields.
+
+    NOTE(review): presumably requires axe-core to be injected into the page
+    beforehand, and ``getEntireContext`` could not be confirmed against
+    axe-core's documented API - verify before relying on this.
+    """
+    axe_script = "return window.axe.getEntireContext();"
+    return driver.execute_script(axe_script)
+
+def compare_html_elements(html1: str, html2: str) -> Dict[str, Any]:
+    """Compare two HTML elements and return similarity scores.
+
+    Placeholder implementation: the inputs are not inspected yet and fixed
+    demonstration scores are returned.
+
+    Args:
+        html1: Markup of the element selected by the model.
+        html2: Markup of the ground-truth element.
+
+    Returns:
+        A dict with the keys ``total_score``, ``structure_score``,
+        ``attributes_score`` and ``content_score``.
+    """
+    # TODO: implement the real HTML comparison logic.
+    # The key must be "attributes_score" (plural): evaluate_task reads
+    # html_score['attributes_score'], so the singular spelling raised KeyError.
+    return {
+        "total_score": 0.8,
+        "structure_score": 0.8,
+        "attributes_score": 0.9,
+        "content_score": 0.7,
+    }
+
+def get_llm_evaluation(context: Dict[str, Any]) -> Dict[str, Any]:
+    """Get the LLM evaluation for a task.
+
+    Placeholder implementation: ``context`` is not used yet and a fixed
+    demonstration response is returned.
+
+    Args:
+        context: Evaluation inputs for the task (currently ignored).
+
+    Returns:
+        A dict with the keys ``selected_html``, ``visual_score``,
+        ``confidence`` and ``reasoning``.
+    """
+    # TODO: implement the real LLM evaluation logic.
+    return {
+        # Kept on one line: the literal was previously split across two
+        # lines, which is an unterminated string and a syntax error.
+        "selected_html": "Selected HTML element",
+        "visual_score": 0.9,
+        "confidence": 0.8,
+        "reasoning": "Brief explanation of the evaluation"
+    }
+
def evaluate_task(
task: Dict[str, Any],
result: Dict[str, Any],
ground_truth: Dict[str, Any],
openai_client: OpenAI
) -> Dict[str, Any]:
- """Evaluate a single task using GPT-4V based on visual comparison"""
-
- messages = [
- {"role": "system", "content": SYSTEM_PROMPT},
- {"role": "user", "content": f"""
-Task: {task['task']}
-
-Please compare:
-1. Before screenshot (initial state)
-2. After screenshot (actual result)
-3. Ground truth screenshot (expected result)
-
-Expected visual changes:
-{json.dumps(ground_truth['visual_changes'], indent=2)}
-
-Success criteria:
-{json.dumps(ground_truth['success_criteria'], indent=2)}
-
-Provide:
-1. Similarity score (0-100)
-2. Success status
-3. Brief explanation"""},
- {"role": "assistant", "content": "I'll examine the screenshots and evaluate based on visual similarity, expected changes, and success criteria."},
- {"role": "user", "content": [
- {"type": "text", "text": "Before interaction:"},
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}},
- {"type": "text", "text": "After interaction:"},
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}},
- {"type": "text", "text": "Ground Truth:"},
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}},
- ]}
- ]
-
+ """Evaluate a task using both visual comparison and HTML matching"""
try:
- response = openai_client.chat.completions.create(
+ # 1. Visual Evaluation (existing)
+ messages = [
+ {"role": "system", "content": SYSTEM_PROMPT},
+ {"role": "user", "content": [
+ {"type": "text", "text": "Compare these screenshots:"},
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}},
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}},
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}},
+ ]}
+ ]
+
+ visual_response = openai_client.chat.completions.create(
model="gpt-4-vision-preview",
messages=messages,
max_tokens=1000,
temperature=0
)
- evaluation = response.choices[0].message.content
+ # 2. HTML Element Matching (new)
+ html_score = compare_html_elements(
+ result.get('html_element', ''), # From model's response
+ ground_truth.get('target_html', ''), # From ground truth
+ )
+
+ # 3. Combine Scores
+ visual_score = extract_score(visual_response.choices[0].message.content)
- # Extract score and success status
- import re
- score_match = re.search(r'(\d+)(?=/100|%)', evaluation)
- score = int(score_match.group(1)) if score_match else 0
+ final_score = (
+ visual_score * 0.6 + # Weight visual score more
+ html_score['total_score'] * 0.4 # HTML matching score
+ )
return {
"task_id": task["id"],
- "score": score,
- "success": score >= 90,
- "evaluation": evaluation,
+ "visual_evaluation": {
+ "score": visual_score,
+ "details": visual_response.choices[0].message.content
+ },
+ "html_evaluation": {
+ "score": html_score['total_score'],
+ "structure_score": html_score['structure_score'],
+ "attributes_score": html_score['attributes_score'],
+ "content_score": html_score['content_score']
+ },
+ "final_score": final_score,
+ "success": final_score >= 0.9,
"timestamp": int(time.time())
}
except Exception as e:
+ logging.error(f"Error evaluating task: {str(e)}")
return {
"task_id": task["id"],
- "score": 0,
+ "error": str(e),
+ "final_score": 0.0,
"success": False,
- "evaluation": f"Evaluation failed: {str(e)}",
"timestamp": int(time.time())
}
+def extract_score(evaluation_text: str) -> float:
+    """Extract a numerical score from evaluation text.
+
+    Two reply shapes are recognised, in this order:
+
+    1. The JSON reply requested by ``SYSTEM_PROMPT``, which carries the
+       score as ``"visual_score": <number between 0.0 and 1.0>``.
+    2. The legacy free-text reply, where the score is an integer directly
+       followed by ``/100`` or ``%``; that integer is divided by 100.
+
+    Args:
+        evaluation_text: Raw text of the model reply.
+
+    Returns:
+        The extracted score, or 0.0 when the text is empty or contains
+        neither form.
+    """
+    import re
+
+    if not evaluation_text:
+        return 0.0
+
+    # Match the key directly rather than json.loads(): the reply may be
+    # wrapped in a code fence or carry "# ..." comments like the template.
+    json_match = re.search(
+        r'"visual_score"\s*:\s*(\d+(?:\.\d+)?|\.\d+)', evaluation_text
+    )
+    if json_match:
+        # Cap at 1.0 so an out-of-range reply cannot inflate the weighted
+        # final score.
+        return min(1.0, float(json_match.group(1)))
+
+    score_match = re.search(r'(\d+)(?=/100|%)', evaluation_text)
+    return float(score_match.group(1)) / 100 if score_match else 0.0
+
def run_evaluation(
tasks_file: Path,
results_dir: Path,
@@ -136,18 +168,22 @@ def run_evaluation(
for task in tasks:
task_result = next((r for r in results if r["task_id"] == task["id"]), None)
if task_result:
- evaluation = evaluate_task(
- task,
- task_result,
- ground_truth_dir,
- openai_client
- )
- evaluations.append(evaluation)
+ ground_truth = next((gt for gt in ground_truth_dir.iterdir() if gt.name == f"{task['id']}.json"), None)
+ if ground_truth:
+ with open(ground_truth) as f:
+ ground_truth_data = json.load(f)
+ evaluation = evaluate_task(
+ task,
+ task_result,
+ ground_truth_data,
+ openai_client
+ )
+ evaluations.append(evaluation)
# Save evaluations
output = {
"total_tasks": len(tasks),
- "successful_tasks": sum(1 for e in evaluations if e["success"]),
+ "successful_tasks": sum(1 for e in evaluations if e["final_score"] > 0.5),
"evaluations": evaluations
}
diff --git a/prompts.py b/prompts.py
index 1499e23..9915db8 100644
--- a/prompts.py
+++ b/prompts.py
@@ -1,50 +1,48 @@
from typing import Dict, Any
-SYSTEM_PROMPT = """You are an AI agent designed to interact with web elements. Your task is to execute specific web interactions based on natural language descriptions.
-
-Focus on the following:
-1. Element Identification: Use the provided accessibility tree and visual context to identify the correct element
-2. Precise Interaction: Execute the exact interaction required (click, type, hover)
-3. Accuracy: Ensure you interact with the correct element, as there may be similar elements on the page
+SYSTEM_PROMPT = """You are an AI agent designed to interact with web elements. Your task is to:
+1. Execute the specified web interaction (click, type, etc.)
+2. Return the exact HTML element you interacted with
Guidelines:
-- Pay attention to element attributes (role, type, name) in the accessibility tree
-- Consider the visual context and location of elements
-- Be precise in your interactions - click exactly where specified
-- Handle dynamic elements and wait for page loads appropriately
+- Execute the interaction precisely as specified
+- Return the complete HTML element including all attributes and content
+- Use the accessibility tree to help identify the correct element
+- Consider both visual context and element attributes
-Example Task:
+Your response MUST be in this exact JSON format:
{
- "web_name": "Amazon",
- "task": "Click the search button",
- "web": "https://www.amazon.com",
- "element_type": "button",
- "interaction": "click",
- "target_element": {
- "type": "id",
- "value": "nav-search-submit-button"
- }
+ "action": {
+ "type": "click|type|hover|etc",
+ "value": "text to type if applicable"
+ },
+ "html_element": "",
+ "confidence": 0.95 # How confident you are in your selection
}
-Remember: Your goal is to execute the interaction accurately and efficiently.
-"""
+Example:
+Task: "Click the search button"
+Response:
+{
+ "action": {
+ "type": "click",
+ "value": null
+ },
+ "html_element": "",
+ "confidence": 0.95
+}"""
def format_task_prompt(task: Dict[str, Any], accessibility_tree: Dict[str, Any] = None) -> str:
"""Format task into prompt for the agent"""
prompt = f"""Website: {task['web_name']}
Task: {task['task']}
URL: {task['web']}
-Required Interaction: {task['interaction']}
-Target Element Type: {task['element_type']}
-Accessibility Tree Information:
-"""
-
- if accessibility_tree:
- prompt += f"```json\n{accessibility_tree}\n```\n"
- else:
- prompt += "Not available\n"
-
- prompt += "\nPlease execute the specified interaction accurately."
+Accessibility Tree:
+```json
+{accessibility_tree if accessibility_tree else 'Not available'}
+```
+
+Execute the task and return both your action and the HTML element you interacted with."""
return prompt
diff --git a/results/README.md b/results/README.md
new file mode 100644
index 0000000..1a1ab3c
--- /dev/null
+++ b/results/README.md
@@ -0,0 +1,70 @@
+# Results Directory
+
+This directory stores benchmark results and evaluations.
+
+## Directory Structure
+
+```
+results/
+├── run_001/ # Each run in its own directory
+│ ├── results.json # Raw results from model
+│ ├── evaluation.json # Evaluation scores
+│ └── screenshots/ # Before/after screenshots
+├── run_002/
+└── ...
+```
+
+## File Formats
+
+### `results.json`
+```json
+{
+ "task_id": "task_001",
+ "action": {
+ "type": "click",
+ "value": null
+ },
+ "html_element": "",
+ "confidence": 0.95,
+ "screenshots": {
+ "before": "before_001.png",
+ "after": "after_001.png"
+ }
+}
+```
+
+### `evaluation.json`
+```json
+{
+ "task_id": "task_001",
+ "visual_evaluation": {
+ "score": 0.95,
+ "details": "..."
+ },
+ "html_evaluation": {
+ "score": 0.92,
+ "structure_score": 0.95,
+ "attributes_score": 0.90,
+ "content_score": 0.89
+ },
+ "final_score": 0.94,
+ "success": true
+}
+```
+
+## Guidelines
+
+1. **Organization**
+ - Create a new directory for each benchmark run
+ - Use consistent naming: `run_XXX`
+ - Keep screenshots organized by task
+
+2. **Storage**
+ - Clean up old runs periodically
+ - Compress screenshots if needed
+ - Back up important results
+
+3. **Analysis**
+ - Use evaluation.json for metrics
+ - Compare runs to track improvements
+ - Document significant changes
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..284342f
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,40 @@
+# Scripts Directory
+
+This directory contains utility scripts for managing and maintaining the benchmark.
+
+## Scripts
+
+### `add_target_html.py`
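+- Adds an empty `target_html` field to every task definition that does not already have one
+- Used for upgrading existing task files to support HTML validation
+- Usage: `python add_target_html.py` (run it from the `scripts/` directory: the script rewrites `../data/dom_tasks.jsonl` in place, using a path relative to the current directory)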
+
+## Adding New Scripts
+
+When adding new utility scripts:
+1. Follow Python best practices
+2. Add proper error handling
+3. Document usage in this README
+4. Include example commands if applicable
+
+## Script Guidelines
+
+1. **File Naming**
+ - Use descriptive names
+ - Separate words with underscores
+ - End with `.py` extension
+
+2. **Documentation**
+ - Add docstrings to all functions
+ - Include usage examples
+ - Document any dependencies
+
+3. **Error Handling**
+ - Handle file I/O errors
+ - Provide meaningful error messages
+ - Add logging where appropriate
+
+4. **Testing**
+ - Add test cases if possible
+ - Include sample data if needed
+ - Document test procedures
diff --git a/scripts/add_target_html.py b/scripts/add_target_html.py
new file mode 100644
index 0000000..35220e6
--- /dev/null
+++ b/scripts/add_target_html.py
@@ -0,0 +1,21 @@
+import json
+
+def add_target_html_field(path='../data/dom_tasks.jsonl'):
+    """Ensure every task in a JSONL task file has a ``target_html`` field.
+
+    The file is read completely, every task lacking ``target_html`` gets an
+    empty string for it, and the file is then rewritten in place with one
+    JSON object per line.
+
+    Args:
+        path: JSONL task file to upgrade. The default is relative to the
+            current working directory, so when called without an argument
+            the script has to be run from the ``scripts/`` directory.
+
+    Returns:
+        The number of tasks that were missing ``target_html``.
+
+    Raises:
+        OSError: If the file cannot be read or written.
+        json.JSONDecodeError: If a non-blank line is not valid JSON; the
+            message names the file and the line number.
+    """
+    tasks = []
+    added = 0
+
+    # Parse everything before reopening the file for writing, so a malformed
+    # line aborts while the original file is still intact.
+    with open(path, 'r', encoding='utf-8') as f:
+        for line_no, line in enumerate(f, start=1):
+            if not line.strip():
+                continue  # tolerate blank lines instead of crashing on them
+            try:
+                task = json.loads(line)
+            except json.JSONDecodeError as err:
+                # Same exception type as before, with the location added.
+                raise json.JSONDecodeError(
+                    f"{err.msg} (in {path}, line {line_no})", err.doc, err.pos
+                ) from err
+            if 'target_html' not in task:
+                task['target_html'] = ""
+                added += 1
+            tasks.append(task)
+
+    # ensure_ascii=False keeps non-ASCII task text readable instead of
+    # rewriting it as \u escapes; the explicit encoding makes that safe.
+    with open(path, 'w', encoding='utf-8') as f:
+        f.writelines(
+            json.dumps(task, ensure_ascii=False) + '\n' for task in tasks
+        )
+
+    return added
+
+
+if __name__ == "__main__":
+    add_target_html_field()
diff --git a/utils/README.md b/utils/README.md
new file mode 100644
index 0000000..863ca62
--- /dev/null
+++ b/utils/README.md
@@ -0,0 +1,54 @@
+# Utils Directory
+
+This directory contains utility functions and helper modules used throughout the benchmark system.
+
+## Files
+
+### `accessibility_utils.py`
+- Handles accessibility tree extraction and HTML element comparison
+- Key functions:
+ - `get_accessibility_tree()`: Extracts accessibility tree from webpage
+ - `get_element_html_context()`: Gets HTML context for an element
+ - `compare_html_elements()`: Compares two HTML elements for similarity
+
+### Other Utils
+- Helper functions for web interaction
+- Image processing utilities
+- Common data structures and types
+
+## HTML Element Comparison
+
+The HTML comparison system uses three metrics:
+1. **Structure Score (40%)**
+ - Tag name matching (50% of the structure score)
+ - Parent element matching (30%)
+ - Sibling context: overlap of sibling tag names (20%)
+
+2. **Attributes Score (30%)**
+ - Matching of key attributes (id, class, etc.)
+ - Handling of dynamic attributes
+
+3. **Content Score (30%)**
+ - Inner HTML similarity
+ - Text content matching
+
+## Usage Example
+
+```python
+from utils.accessibility_utils import get_element_html_context, compare_html_elements
+
+# Get HTML context for an element
+element_context = get_element_html_context(driver, element)
+
+# Compare with ground truth (a context dict with the same keys as element_context, not a raw HTML string)
+similarity_score = compare_html_elements(
+ element_context,
+ ground_truth_html
+)
+
+# Score breakdown
+print(f"Structure Score: {similarity_score['structure_score']}")
+print(f"Attributes Score: {similarity_score['attributes_score']}")
+print(f"Content Score: {similarity_score['content_score']}")
+print(f"Total Score: {similarity_score['total_score']}")
+```
diff --git a/utils/accessibility_utils.py b/utils/accessibility_utils.py
index 5f5a1df..abe5e80 100644
--- a/utils/accessibility_utils.py
+++ b/utils/accessibility_utils.py
@@ -101,3 +101,59 @@ def build_tree(node_id: str, nodes_map: Dict[str, AccessibilityNode], depth: int
)
return build_tree(root_node["nodeId"], nodes_map)
+
+def get_element_html_context(driver: webdriver.Chrome, element) -> Dict[str, Any]:
+    """Collect an element's HTML plus a little structural context.
+
+    Args:
+        driver: Not used by this function; kept so existing callers keep
+            working.
+        element: The element to describe. It must provide ``get_attribute``,
+            ``tag_name``, ``find_element`` and ``find_elements``
+            (presumably a Selenium ``WebElement`` -- confirm against callers).
+
+    Returns:
+        A dict with the keys ``outer_html``, ``inner_html``, ``tag_name``,
+        ``attributes`` (the non-empty values among id, class, name, type,
+        value, href and src), ``parent_tag`` and ``siblings`` (tag names of
+        up to three other children of the parent).
+    """
+    outer_html = element.get_attribute("outerHTML")
+    inner_html = element.get_attribute("innerHTML")
+    tag_name = element.tag_name
+
+    # Fetch each attribute once and reuse the value for both the emptiness
+    # check and the result, instead of asking the element twice.
+    attributes = {}
+    for name in ("id", "class", "name", "type", "value", "href", "src"):
+        value = element.get_attribute(name)
+        if value:
+            attributes[name] = value
+
+    # No truthiness guard here: the calls above already require a usable
+    # element, so a guard at this point could never take effect.
+    parent_tag = element.find_element("xpath", "..").tag_name
+
+    # Only three siblings are kept, so stop reading tag names once we have
+    # them rather than walking every child of the parent.
+    siblings = []
+    for other in element.find_elements("xpath", "../*"):
+        if other != element:
+            siblings.append(other.tag_name)
+            if len(siblings) == 3:
+                break
+
+    return {
+        "outer_html": outer_html,
+        "inner_html": inner_html,
+        "tag_name": tag_name,
+        "attributes": attributes,
+        "parent_tag": parent_tag,
+        "siblings": siblings,
+    }
+
+def compare_html_elements(suggested: Dict[str, Any], target: Dict[str, Any]) -> Dict[str, float]:
+    """Score how closely a suggested element matches the target element.
+
+    Both arguments are context dicts with the keys ``tag_name``,
+    ``parent_tag``, ``siblings`` (list of tag names), ``attributes`` (dict)
+    and ``inner_html`` (string or ``None``).
+
+    Returns:
+        A dict of scores in the range 0..1:
+        ``structure_score`` (tag 50%, parent tag 30%, sibling overlap 20%),
+        ``attributes_score`` (share of target attributes matched exactly),
+        ``content_score`` (``SequenceMatcher`` ratio of the inner HTML) and
+        ``total_score`` (structure 40%, attributes 30%, content 30%).
+        Comparing an element with itself yields 1.0 for every score.
+    """
+    from collections import Counter
+    from difflib import SequenceMatcher
+
+    def _coverage(matched: int, expected: int, offered: int) -> float:
+        # Share of the target's items that were matched. With nothing to
+        # match, only an equally empty suggestion counts as a perfect match.
+        if expected:
+            return matched / expected
+        return 0.0 if offered else 1.0
+
+    # Structure score (40%)
+    suggested_siblings = suggested["siblings"]
+    target_siblings = target["siblings"]
+    # Multiset intersection: repeated tags (e.g. several <li>) each count,
+    # so the numerator is on the same scale as len(target_siblings).
+    shared_siblings = sum(
+        (Counter(suggested_siblings) & Counter(target_siblings)).values()
+    )
+    sibling_score = _coverage(
+        shared_siblings, len(target_siblings), len(suggested_siblings)
+    )
+    structure_score = (
+        (suggested["tag_name"] == target["tag_name"]) * 0.5 +
+        (suggested["parent_tag"] == target["parent_tag"]) * 0.3 +
+        sibling_score * 0.2
+    )
+
+    # Attributes score (30%)
+    suggested_attrs = suggested["attributes"]
+    target_attrs = target["attributes"]
+    matching_attrs = sum(
+        1
+        for name, value in target_attrs.items()
+        if name in suggested_attrs and suggested_attrs[name] == value
+    )
+    attrs_score = _coverage(
+        matching_attrs, len(target_attrs), len(suggested_attrs)
+    )
+
+    # Content similarity score (30%); a missing inner HTML is treated as
+    # empty text so the comparison cannot fail on None.
+    content_score = SequenceMatcher(
+        None,
+        suggested["inner_html"] or "",
+        target["inner_html"] or ""
+    ).ratio()
+
+    return {
+        "structure_score": structure_score,
+        "attributes_score": attrs_score,
+        "content_score": content_score,
+        "total_score": (
+            structure_score * 0.4 +
+            attrs_score * 0.3 +
+            content_score * 0.3
+        )
+    }