diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..4cbbd87 --- /dev/null +++ b/data/README.md @@ -0,0 +1,83 @@ +# DOM Task Format + +This document describes the format for DOM interaction tasks in our benchmark. + +## Schema + +Tasks are defined in JSONL format, where each line is a valid JSON object following the schema in `task_schema.json`. + +## Example Task + +```json +{ + "web_name": "Cambridge Dictionary", + "id": "cambridge_lookup_1", + "task": "Click the search box and type 'hello'", + "web": "https://dictionary.cambridge.org/", + "element_type": "input", + "interaction": "type", + "target_element": { + "type": "id", + "value": "searchword" + }, + "input_text": "hello", + "target_html": "", + "ground_truth": { + "screenshot": "cambridge_lookup_1_gt.png", + "description": "The word 'hello' has been entered in the search box", + "visual_changes": [ + "Text 'hello' appears in search box", + "Text cursor visible at end of input", + "Search suggestions may appear" + ], + "success_criteria": [ + "Input text matches 'hello' exactly", + "Text is visible in search box", + "Search box maintains focus" + ] + } +} +``` + +## Field Descriptions + +### Basic Information +- `web_name`: Name of the website +- `id`: Unique identifier for the task +- `task`: Human-readable task description +- `web`: Website URL + +### Element and Interaction +- `element_type`: Type of HTML element (input, button, link, etc.) 
+- `interaction`: Type of interaction (click, type, hover) +- `target_element`: How to find the element + - `type`: Selector type (id, class, name, text) + - `value`: Selector value +- `input_text`: Text to type (only for type interactions) + +### Validation +- `target_html`: The actual HTML element for structural validation +- `ground_truth`: Validation data + - `screenshot`: Reference screenshot filename + - `description`: What should happen + - `visual_changes`: List of expected visual changes + - `success_criteria`: Specific conditions for success + +## Validation Process + +Tasks are validated using two methods: +1. **Visual Validation** (60% of score) + - Compares screenshots before/after interaction + - Verifies visual changes match ground truth + +2. **HTML Validation** (40% of score) + - Matches the HTML element the model interacted with + - Checks structure, attributes, and content + +## Adding New Tasks + +1. Follow the schema in `task_schema.json` +2. Ensure unique task IDs +3. Provide clear success criteria +4. Include reference screenshots +5. 
Fill in the `target_html` field with the actual HTML element diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl index 8dba4f3..a5250c5 100644 --- a/data/dom_tasks.jsonl +++ b/data/dom_tasks.jsonl @@ -1,80 +1,80 @@ -{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "success_criteria": ["Search button responds to click", "Results page loads", "No error messages displayed"]}} -{"web_name": "AllRecipes", "id": "allrecipes_search_1", "task": "Click the search box and type 'vegetarian lasagna'", "web": "https://www.allrecipes.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-box"}, "input_text": "vegetarian lasagna", "ground_truth": {"screenshot": "allrecipes_search_1_gt.png", "description": "Search term entered in search box", "visual_changes": ["Text appears in search box", "Search suggestions may 
appear"], "success_criteria": ["Text matches exactly", "Search box contains text"]}} -{"web_name": "AllRecipes", "id": "allrecipes_filter_1", "task": "Click the 'Ratings' filter button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ratings"}, "ground_truth": {"screenshot": "allrecipes_filter_1_gt.png", "description": "Ratings filter dropdown opens", "visual_changes": ["Dropdown menu appears", "Filter options visible"], "success_criteria": ["Dropdown menu is visible", "Filter options are clickable"]}} -{"web_name": "Amazon", "id": "amazon_search_1", "task": "Click the search box and type 'laptop'", "web": "https://www.amazon.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "twotabsearchtextbox"}, "input_text": "laptop", "ground_truth": {"screenshot": "amazon_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Search suggestions appear"], "success_criteria": ["Text matches exactly", "Search suggestions visible"]}} -{"web_name": "Amazon", "id": "amazon_menu_1", "task": "Click the hamburger menu button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "nav-hamburger-menu"}, "ground_truth": {"screenshot": "amazon_menu_1_gt.png", "description": "Side menu opens", "visual_changes": ["Menu slides in from left", "Menu options visible"], "success_criteria": ["Menu is visible", "Menu items are clickable"]}} -{"web_name": "Apple", "id": "apple_menu_1", "task": "Click the 'Mac' menu item", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Mac"}, "ground_truth": {"screenshot": "apple_menu_1_gt.png", "description": "Mac page loads", "visual_changes": ["Page transitions to Mac section", "Mac products visible"], "success_criteria": ["Page URL 
changes", "Mac content visible"]}} -{"web_name": "Apple", "id": "apple_search_1", "task": "Click the search icon", "web": "https://www.apple.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-search-trigger"}, "ground_truth": {"screenshot": "apple_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search box is active"]}} -{"web_name": "ArXiv", "id": "arxiv_search_1", "task": "Click the search box and type 'quantum computing'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "quantum computing", "ground_truth": {"screenshot": "arxiv_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box"], "success_criteria": ["Text matches exactly", "Search box contains text"]}} -{"web_name": "ArXiv", "id": "arxiv_filter_1", "task": "Click the 'Advanced Search' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Advanced Search"}, "ground_truth": {"screenshot": "arxiv_filter_1_gt.png", "description": "Advanced search page loads", "visual_changes": ["Page transitions to advanced search", "Search filters visible"], "success_criteria": ["URL changes to advanced search", "Advanced search form visible"]}} -{"web_name": "BBC News", "id": "bbc_menu_1", "task": "Click the menu button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "orbit-more-menu"}, "ground_truth": {"screenshot": "bbc_menu_1_gt.png", "description": "Menu overlay opens", "visual_changes": ["Menu overlay appears", "Navigation options visible"], "success_criteria": ["Menu overlay is visible", "Menu items are clickable"]}} -{"web_name": "BBC 
News", "id": "bbc_search_1", "task": "Click the search icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "orbit-search__button"}, "ground_truth": {"screenshot": "bbc_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search input is active"]}} -{"web_name": "Booking", "id": "booking_destination_1", "task": "Click the destination input and type 'Paris'", "web": "https://www.booking.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "ss"}, "input_text": "Paris", "ground_truth": {"screenshot": "booking_destination_1_gt.png", "description": "Destination entered", "visual_changes": ["Text appears in input", "Location suggestions appear"], "success_criteria": ["Text matches exactly", "Suggestions are visible"]}} -{"web_name": "Booking", "id": "booking_dates_1", "task": "Click the check-in date field", "web": "https://www.booking.com/", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "sb-date-field__field"}, "ground_truth": {"screenshot": "booking_dates_1_gt.png", "description": "Calendar overlay opens", "visual_changes": ["Calendar overlay appears", "Available dates highlighted"], "success_criteria": ["Calendar is visible", "Dates are selectable"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_menu_1", "task": "Click the 'More' menu button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "More"}, "ground_truth": {"screenshot": "cambridge_menu_1_gt.png", "description": "More menu opens", "visual_changes": ["Dropdown menu appears", "Menu options visible"], "success_criteria": ["Dropdown is visible", "Menu items are clickable"]}} -{"web_name": "AllRecipes", "id": 
"allrecipes_menu_1", "task": "Click the 'Ingredients' button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ingredients"}, "ground_truth": {"screenshot": "allrecipes_menu_1_gt.png", "description": "Ingredients section expands", "visual_changes": ["Section expands", "Ingredient list visible"], "success_criteria": ["Section is expanded", "Ingredients are visible"]}} -{"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the cart icon", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "ground_truth": {"screenshot": "amazon_cart_1_gt.png", "description": "Cart page loads", "visual_changes": ["Page transitions to cart", "Cart contents visible"], "success_criteria": ["Cart page loads", "Cart status visible"]}} -{"web_name": "Apple", "id": "apple_store_1", "task": "Click the 'Store' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Store"}, "ground_truth": {"screenshot": "apple_store_1_gt.png", "description": "Store page loads", "visual_changes": ["Page transitions to store", "Store products visible"], "success_criteria": ["Store page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_recent_1", "task": "Click the 'Recent' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "recent"}, "ground_truth": {"screenshot": "arxiv_recent_1_gt.png", "description": "Recent submissions page loads", "visual_changes": ["Page transitions to recent submissions", "Recent papers visible"], "success_criteria": ["Recent page loads", "Papers are visible"]}} -{"web_name": "BBC News", "id": "bbc_sport_1", "task": "Click the 'Sport' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", 
"target_element": {"type": "text", "value": "Sport"}, "ground_truth": {"screenshot": "bbc_sport_1_gt.png", "description": "Sports section loads", "visual_changes": ["Page transitions to sports", "Sports news visible"], "success_criteria": ["Sports page loads", "Sports content visible"]}} -{"web_name": "Booking", "id": "booking_guests_1", "task": "Click the guests selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "xp__guests__count"}, "ground_truth": {"screenshot": "booking_guests_1_gt.png", "description": "Guests selector opens", "visual_changes": ["Guests overlay appears", "Guest options visible"], "success_criteria": ["Overlay is visible", "Guest controls are active"]}} -{"web_name": "Booking", "id": "booking_search_1", "task": "Click the search button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "sb-searchbox__button"}, "ground_truth": {"screenshot": "booking_search_1_gt.png", "description": "Search results load", "visual_changes": ["Page transitions to results", "Available properties shown"], "success_criteria": ["Results page loads", "Properties are visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_diet_1", "task": "Click the 'Dietary Restrictions' filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Dietary Restrictions"}, "ground_truth": {"screenshot": "allrecipes_diet_1_gt.png", "description": "Diet filter opens", "visual_changes": ["Filter dropdown appears", "Diet options visible"], "success_criteria": ["Dropdown is visible", "Options are clickable"]}} -{"web_name": "Amazon", "id": "amazon_department_1", "task": "Click the department selector", "web": "https://www.amazon.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": 
"searchDropdownBox"}, "ground_truth": {"screenshot": "amazon_department_1_gt.png", "description": "Department dropdown opens", "visual_changes": ["Dropdown menu appears", "Department list visible"], "success_criteria": ["Dropdown is visible", "Departments are selectable"]}} -{"web_name": "Apple", "id": "apple_bag_1", "task": "Click the shopping bag icon", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-bag-item"}, "ground_truth": {"screenshot": "apple_bag_1_gt.png", "description": "Shopping bag overlay opens", "visual_changes": ["Bag overlay appears", "Cart contents visible"], "success_criteria": ["Overlay is visible", "Cart status shown"]}} -{"web_name": "ArXiv", "id": "arxiv_subject_1", "task": "Click the subject area dropdown", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "subject"}, "ground_truth": {"screenshot": "arxiv_subject_1_gt.png", "description": "Subject dropdown opens", "visual_changes": ["Dropdown menu appears", "Subject areas visible"], "success_criteria": ["Dropdown is visible", "Subjects are selectable"]}} -{"web_name": "BBC News", "id": "bbc_region_1", "task": "Click the region selector", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Choose your region"}, "ground_truth": {"screenshot": "bbc_region_1_gt.png", "description": "Region selector opens", "visual_changes": ["Region overlay appears", "Region options visible"], "success_criteria": ["Overlay is visible", "Regions are selectable"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_1", "task": "Click the translation language selector", "web": "https://dictionary.cambridge.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "translation-language"}, "ground_truth": {"screenshot": 
"cambridge_translate_1_gt.png", "description": "Language dropdown opens", "visual_changes": ["Dropdown menu appears", "Language options visible"], "success_criteria": ["Dropdown is visible", "Languages are selectable"]}} -{"web_name": "AllRecipes", "id": "allrecipes_sort_1", "task": "Click the sort dropdown", "web": "https://www.allrecipes.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "sort-dropdown"}, "ground_truth": {"screenshot": "allrecipes_sort_1_gt.png", "description": "Sort options appear", "visual_changes": ["Dropdown menu appears", "Sort options visible"], "success_criteria": ["Dropdown is visible", "Options are selectable"]}} -{"web_name": "Amazon", "id": "amazon_language_1", "task": "Click the language selector", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "icp-nav-flyout"}, "ground_truth": {"screenshot": "amazon_language_1_gt.png", "description": "Language overlay opens", "visual_changes": ["Language overlay appears", "Language options visible"], "success_criteria": ["Overlay is visible", "Languages are selectable"]}} -{"web_name": "BBC News", "id": "bbc_weather_1", "task": "Click the weather widget", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "weather-widget"}, "ground_truth": {"screenshot": "bbc_weather_1_gt.png", "description": "Weather details expand", "visual_changes": ["Weather details appear", "Forecast visible"], "success_criteria": ["Weather details visible", "Forecast information shown"]}} -{"web_name": "Booking", "id": "booking_currency_1", "task": "Click the currency selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "bui-button__text"}, "ground_truth": {"screenshot": "booking_currency_1_gt.png", "description": "Currency selector opens", 
"visual_changes": ["Currency overlay appears", "Currency options visible"], "success_criteria": ["Overlay is visible", "Currencies are selectable"]}} -{"web_name": "AllRecipes", "id": "allrecipes_share_1", "task": "Click the share button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Share"}, "ground_truth": {"screenshot": "allrecipes_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Share options are clickable"]}} -{"web_name": "Amazon", "id": "amazon_account_1", "task": "Click the account menu", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-link-accountList"}, "ground_truth": {"screenshot": "amazon_account_1_gt.png", "description": "Account menu opens", "visual_changes": ["Account overlay appears", "Account options visible"], "success_criteria": ["Overlay is visible", "Account options are clickable"]}} -{"web_name": "Apple", "id": "apple_support_1", "task": "Click the 'Support' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Support"}, "ground_truth": {"screenshot": "apple_support_1_gt.png", "description": "Support page loads", "visual_changes": ["Page transitions to support", "Support options visible"], "success_criteria": ["Support page loads", "Support content visible"]}} -{"web_name": "ArXiv", "id": "arxiv_help_1", "task": "Click the 'Help' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Help"}, "ground_truth": {"screenshot": "arxiv_help_1_gt.png", "description": "Help page loads", "visual_changes": ["Page transitions to help", "Help content visible"], "success_criteria": ["Help page loads", "Help content 
visible"]}} -{"web_name": "BBC News", "id": "bbc_video_1", "task": "Click the video player", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "media-player"}, "ground_truth": {"screenshot": "bbc_video_1_gt.png", "description": "Video player activates", "visual_changes": ["Video starts playing", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_grammar_1", "task": "Click the 'Grammar' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Grammar"}, "ground_truth": {"screenshot": "cambridge_grammar_1_gt.png", "description": "Grammar section loads", "visual_changes": ["Page transitions to grammar", "Grammar content visible"], "success_criteria": ["Grammar page loads", "Grammar content visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_print_1", "task": "Click the print button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Print"}, "ground_truth": {"screenshot": "allrecipes_print_1_gt.png", "description": "Print dialog opens", "visual_changes": ["Print overlay appears", "Print options visible"], "success_criteria": ["Print dialog visible", "Print options available"]}} -{"web_name": "Amazon", "id": "amazon_orders_1", "task": "Click the 'Returns & Orders' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-orders"}, "ground_truth": {"screenshot": "amazon_orders_1_gt.png", "description": "Orders page loads", "visual_changes": ["Page transitions to orders", "Order history visible"], "success_criteria": ["Orders page loads", "Order history visible"]}} -{"web_name": "BBC News", "id": "bbc_notification_1", "task": "Click the notification 
bell icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "notification-bell"}, "ground_truth": {"screenshot": "bbc_notification_1_gt.png", "description": "Notification settings open", "visual_changes": ["Notification overlay appears", "Notification options visible"], "success_criteria": ["Overlay is visible", "Settings are accessible"]}} -{"web_name": "Booking", "id": "booking_property_1", "task": "Click the property type filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Property type"}, "ground_truth": {"screenshot": "booking_property_1_gt.png", "description": "Property types appear", "visual_changes": ["Filter overlay appears", "Property options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}} -{"web_name": "AllRecipes", "id": "allrecipes_save_1", "task": "Click the save recipe button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Save Recipe"}, "ground_truth": {"screenshot": "allrecipes_save_1_gt.png", "description": "Save options appear", "visual_changes": ["Save overlay appears", "Collection options visible"], "success_criteria": ["Save dialog visible", "Collections are selectable"]}} -{"web_name": "Amazon", "id": "amazon_filter_1", "task": "Click the price filter dropdown", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Price"}, "ground_truth": {"screenshot": "amazon_filter_1_gt.png", "description": "Price ranges appear", "visual_changes": ["Price overlay appears", "Range options visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}} -{"web_name": "Apple", "id": "apple_watch_1", "task": "Click the 'Watch' link", "web": "https://www.apple.com/", "element_type": 
"link", "interaction": "click", "target_element": {"type": "text", "value": "Watch"}, "ground_truth": {"screenshot": "apple_watch_1_gt.png", "description": "Watch page loads", "visual_changes": ["Page transitions to Watch", "Watch products visible"], "success_criteria": ["Watch page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_pdf_1", "task": "Click the PDF link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "PDF"}, "ground_truth": {"screenshot": "arxiv_pdf_1_gt.png", "description": "PDF starts downloading", "visual_changes": ["Download starts", "Download indicator visible"], "success_criteria": ["Download begins", "Download status shown"]}} -{"web_name": "BBC News", "id": "bbc_business_1", "task": "Click the 'Business' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Business"}, "ground_truth": {"screenshot": "bbc_business_1_gt.png", "description": "Business section loads", "visual_changes": ["Page transitions to business", "Business news visible"], "success_criteria": ["Business page loads", "Business content visible"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_thesaurus_1", "task": "Click the 'Thesaurus' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Thesaurus"}, "ground_truth": {"screenshot": "cambridge_thesaurus_1_gt.png", "description": "Thesaurus section loads", "visual_changes": ["Page transitions to thesaurus", "Thesaurus content visible"], "success_criteria": ["Thesaurus page loads", "Synonyms are visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_review_1", "task": "Click the reviews tab", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Reviews"}, "ground_truth": 
{"screenshot": "allrecipes_review_1_gt.png", "description": "Reviews section opens", "visual_changes": ["Reviews section appears", "Review content visible"], "success_criteria": ["Reviews are visible", "Rating information shown"]}} -{"web_name": "Amazon", "id": "amazon_prime_1", "task": "Click the Prime benefits link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Prime"}, "ground_truth": {"screenshot": "amazon_prime_1_gt.png", "description": "Prime page loads", "visual_changes": ["Page transitions to Prime", "Prime benefits visible"], "success_criteria": ["Prime page loads", "Benefits are visible"]}} -{"web_name": "Apple", "id": "apple_ipad_1", "task": "Click the 'iPad' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPad"}, "ground_truth": {"screenshot": "apple_ipad_1_gt.png", "description": "iPad page loads", "visual_changes": ["Page transitions to iPad", "iPad products visible"], "success_criteria": ["iPad page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_abstract_1", "task": "Click the abstract toggle", "web": "https://arxiv.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Abstract"}, "ground_truth": {"screenshot": "arxiv_abstract_1_gt.png", "description": "Abstract expands", "visual_changes": ["Abstract section expands", "Full text visible"], "success_criteria": ["Abstract is expanded", "Text is readable"]}} -{"web_name": "BBC News", "id": "bbc_tech_1", "task": "Click the 'Technology' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Technology"}, "ground_truth": {"screenshot": "bbc_tech_1_gt.png", "description": "Technology section loads", "visual_changes": ["Page transitions to technology", "Tech news visible"], 
"success_criteria": ["Tech page loads", "Tech content visible"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_2", "task": "Click the search box and type 'bonjour'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "bonjour", "ground_truth": {"screenshot": "cambridge_translate_2_gt.png", "description": "Word entered in search", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_cuisine_1", "task": "Click the cuisine filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cuisine"}, "ground_truth": {"screenshot": "allrecipes_cuisine_1_gt.png", "description": "Cuisine options appear", "visual_changes": ["Filter overlay appears", "Cuisine options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}} -{"web_name": "Amazon", "id": "amazon_deals_1", "task": "Click the 'Today's Deals' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Today's Deals"}, "ground_truth": {"screenshot": "amazon_deals_1_gt.png", "description": "Deals page loads", "visual_changes": ["Page transitions to deals", "Deal items visible"], "success_criteria": ["Deals page loads", "Deals are visible"]}} -{"web_name": "Booking", "id": "booking_map_1", "task": "Click the map view button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Map"}, "ground_truth": {"screenshot": "booking_map_1_gt.png", "description": "Map view opens", "visual_changes": ["Map interface appears", "Property markers visible"], "success_criteria": ["Map is visible", "Properties are 
plotted"]}} -{"web_name": "Apple", "id": "apple_airpods_1", "task": "Click the 'AirPods' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "AirPods"}, "ground_truth": {"screenshot": "apple_airpods_1_gt.png", "description": "AirPods page loads", "visual_changes": ["Page transitions to AirPods", "AirPods products visible"], "success_criteria": ["AirPods page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_author_1", "task": "Click the author search link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Author"}, "ground_truth": {"screenshot": "arxiv_author_1_gt.png", "description": "Author search opens", "visual_changes": ["Author search interface appears", "Search options visible"], "success_criteria": ["Search interface visible", "Author field active"]}} -{"web_name": "BBC News", "id": "bbc_share_1", "task": "Click the share button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "share-button"}, "ground_truth": {"screenshot": "bbc_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Options are clickable"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_examples_1", "task": "Click the 'Examples' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Examples"}, "ground_truth": {"screenshot": "cambridge_examples_1_gt.png", "description": "Examples section loads", "visual_changes": ["Page transitions to examples", "Usage examples visible"], "success_criteria": ["Examples page loads", "Examples are visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_nutrition_1", "task": "Click the nutrition 
info button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Nutrition"}, "ground_truth": {"screenshot": "allrecipes_nutrition_1_gt.png", "description": "Nutrition info appears", "visual_changes": ["Nutrition overlay appears", "Nutritional values visible"], "success_criteria": ["Overlay is visible", "Values are readable"]}} -{"web_name": "Amazon", "id": "amazon_wishlist_1", "task": "Click the 'Add to List' button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Add to List"}, "ground_truth": {"screenshot": "amazon_wishlist_1_gt.png", "description": "List options appear", "visual_changes": ["List overlay appears", "List options visible"], "success_criteria": ["Overlay is visible", "Lists are selectable"]}} -{"web_name": "Apple", "id": "apple_iphone_1", "task": "Click the 'iPhone' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPhone"}, "ground_truth": {"screenshot": "apple_iphone_1_gt.png", "description": "iPhone page loads", "visual_changes": ["Page transitions to iPhone", "iPhone products visible"], "success_criteria": ["iPhone page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_date_1", "task": "Click the date range filter", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "date-range"}, "ground_truth": {"screenshot": "arxiv_date_1_gt.png", "description": "Date options appear", "visual_changes": ["Date dropdown appears", "Range options visible"], "success_criteria": ["Dropdown is visible", "Ranges are selectable"]}} -{"web_name": "BBC News", "id": "bbc_climate_1", "task": "Click the 'Climate' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": 
"text", "value": "Climate"}, "ground_truth": {"screenshot": "bbc_climate_1_gt.png", "description": "Climate section loads", "visual_changes": ["Page transitions to climate", "Climate news visible"], "success_criteria": ["Climate page loads", "Climate content visible"]}} -{"web_name": "Booking", "id": "booking_rating_1", "task": "Click the rating filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Rating"}, "ground_truth": {"screenshot": "booking_rating_1_gt.png", "description": "Rating options appear", "visual_changes": ["Rating overlay appears", "Score options visible"], "success_criteria": ["Overlay is visible", "Ratings are selectable"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_browse_1", "task": "Click the 'Browse Dictionary' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Browse Dictionary"}, "ground_truth": {"screenshot": "cambridge_browse_1_gt.png", "description": "Browse page loads", "visual_changes": ["Page transitions to browse", "Word categories visible"], "success_criteria": ["Browse page loads", "Categories are visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_video_1", "task": "Click the recipe video play button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "video-play"}, "ground_truth": {"screenshot": "allrecipes_video_1_gt.png", "description": "Video starts playing", "visual_changes": ["Video begins playback", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}} -{"web_name": "Amazon", "id": "amazon_seller_1", "task": "Click the 'Other Sellers' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Other Sellers"}, "ground_truth": 
{"screenshot": "amazon_seller_1_gt.png", "description": "Seller options appear", "visual_changes": ["Seller list appears", "Price options visible"], "success_criteria": ["Seller list visible", "Prices are shown"]}} -{"web_name": "Apple", "id": "apple_tv_1", "task": "Click the 'TV & Home' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "TV & Home"}, "ground_truth": {"screenshot": "apple_tv_1_gt.png", "description": "TV & Home page loads", "visual_changes": ["Page transitions to TV", "TV products visible"], "success_criteria": ["TV page loads", "Products are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_title_1", "task": "Click the search box and type 'machine learning'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "machine learning", "ground_truth": {"screenshot": "arxiv_title_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}} -{"web_name": "BBC News", "id": "bbc_science_1", "task": "Click the 'Science' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Science"}, "ground_truth": {"screenshot": "bbc_science_1_gt.png", "description": "Science section loads", "visual_changes": ["Page transitions to science", "Science news visible"], "success_criteria": ["Science page loads", "Science content visible"]}} -{"web_name": "Cambridge Dictionary", "id": "cambridge_word_1", "task": "Click the 'Word of the Day' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Word of the Day"}, "ground_truth": {"screenshot": "cambridge_word_1_gt.png", "description": "Word of the Day 
loads", "visual_changes": ["Page transitions to word", "Word details visible"], "success_criteria": ["Word page loads", "Definition visible"]}} -{"web_name": "AllRecipes", "id": "allrecipes_time_1", "task": "Click the cooking time filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cooking Time"}, "ground_truth": {"screenshot": "allrecipes_time_1_gt.png", "description": "Time options appear", "visual_changes": ["Filter overlay appears", "Time ranges visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}} -{"web_name": "Amazon", "id": "amazon_gift_1", "task": "Click the 'Gift Cards' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Gift Cards"}, "ground_truth": {"screenshot": "amazon_gift_1_gt.png", "description": "Gift cards page loads", "visual_changes": ["Page transitions to gifts", "Gift card options visible"], "success_criteria": ["Gift page loads", "Options are visible"]}} -{"web_name": "Booking", "id": "booking_popular_1", "task": "Click the 'Popular filters' button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Popular filters"}, "ground_truth": {"screenshot": "booking_popular_1_gt.png", "description": "Popular filters appear", "visual_changes": ["Filter overlay appears", "Popular options visible"], "success_criteria": ["Overlay is visible", "Filters are selectable"]}} -{"web_name": "Apple", "id": "apple_music_1", "task": "Click the 'Music' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Music"}, "ground_truth": {"screenshot": "apple_music_1_gt.png", "description": "Music page loads", "visual_changes": ["Page transitions to music", "Music services visible"], "success_criteria": ["Music page 
loads", "Services are visible"]}} -{"web_name": "ArXiv", "id": "arxiv_stats_1", "task": "Click the 'Statistics' subject link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Statistics"}, "ground_truth": {"screenshot": "arxiv_stats_1_gt.png", "description": "Statistics papers load", "visual_changes": ["Page transitions to stats", "Statistics papers visible"], "success_criteria": ["Stats page loads", "Papers are visible"]}} -{"web_name": "BBC News", "id": "bbc_local_1", "task": "Click the 'Local News' link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Local News"}, "ground_truth": {"screenshot": "bbc_local_1_gt.png", "description": "Local news loads", "visual_changes": ["Page transitions to local", "Local news visible"], "success_criteria": ["Local page loads", "News is visible"]}} \ No newline at end of file +{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "target_html": "", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}} +{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "target_html": "", "ground_truth": {"screenshot": 
"cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "success_criteria": ["Search button responds to click", "Results page loads", "No error messages displayed"]}} +{"web_name": "AllRecipes", "id": "allrecipes_search_1", "task": "Click the search box and type 'vegetarian lasagna'", "web": "https://www.allrecipes.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-box"}, "input_text": "vegetarian lasagna", "ground_truth": {"screenshot": "allrecipes_search_1_gt.png", "description": "Search term entered in search box", "visual_changes": ["Text appears in search box", "Search suggestions may appear"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_filter_1", "task": "Click the 'Ratings' filter button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ratings"}, "ground_truth": {"screenshot": "allrecipes_filter_1_gt.png", "description": "Ratings filter dropdown opens", "visual_changes": ["Dropdown menu appears", "Filter options visible"], "success_criteria": ["Dropdown menu is visible", "Filter options are clickable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_search_1", "task": "Click the search box and type 'laptop'", "web": "https://www.amazon.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "twotabsearchtextbox"}, "input_text": "laptop", "ground_truth": {"screenshot": "amazon_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Search suggestions appear"], "success_criteria": ["Text matches exactly", "Search suggestions visible"]}, "target_html": ""} 
+{"web_name": "Amazon", "id": "amazon_menu_1", "task": "Click the hamburger menu button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "nav-hamburger-menu"}, "ground_truth": {"screenshot": "amazon_menu_1_gt.png", "description": "Side menu opens", "visual_changes": ["Menu slides in from left", "Menu options visible"], "success_criteria": ["Menu is visible", "Menu items are clickable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_menu_1", "task": "Click the 'Mac' menu item", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Mac"}, "ground_truth": {"screenshot": "apple_menu_1_gt.png", "description": "Mac page loads", "visual_changes": ["Page transitions to Mac section", "Mac products visible"], "success_criteria": ["Page URL changes", "Mac content visible"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_search_1", "task": "Click the search icon", "web": "https://www.apple.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-search-trigger"}, "ground_truth": {"screenshot": "apple_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search box is active"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_search_1", "task": "Click the search box and type 'quantum computing'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "quantum computing", "ground_truth": {"screenshot": "arxiv_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""} +{"web_name": "ArXiv", "id": 
"arxiv_filter_1", "task": "Click the 'Advanced Search' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Advanced Search"}, "ground_truth": {"screenshot": "arxiv_filter_1_gt.png", "description": "Advanced search page loads", "visual_changes": ["Page transitions to advanced search", "Search filters visible"], "success_criteria": ["URL changes to advanced search", "Advanced search form visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_menu_1", "task": "Click the menu button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "orbit-more-menu"}, "ground_truth": {"screenshot": "bbc_menu_1_gt.png", "description": "Menu overlay opens", "visual_changes": ["Menu overlay appears", "Navigation options visible"], "success_criteria": ["Menu overlay is visible", "Menu items are clickable"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_search_1", "task": "Click the search icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "orbit-search__button"}, "ground_truth": {"screenshot": "bbc_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search input is active"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_destination_1", "task": "Click the destination input and type 'Paris'", "web": "https://www.booking.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "ss"}, "input_text": "Paris", "ground_truth": {"screenshot": "booking_destination_1_gt.png", "description": "Destination entered", "visual_changes": ["Text appears in input", "Location suggestions appear"], "success_criteria": ["Text matches exactly", "Suggestions are visible"]}, 
"target_html": ""} +{"web_name": "Booking", "id": "booking_dates_1", "task": "Click the check-in date field", "web": "https://www.booking.com/", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "sb-date-field__field"}, "ground_truth": {"screenshot": "booking_dates_1_gt.png", "description": "Calendar overlay opens", "visual_changes": ["Calendar overlay appears", "Available dates highlighted"], "success_criteria": ["Calendar is visible", "Dates are selectable"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_menu_1", "task": "Click the 'More' menu button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "More"}, "ground_truth": {"screenshot": "cambridge_menu_1_gt.png", "description": "More menu opens", "visual_changes": ["Dropdown menu appears", "Menu options visible"], "success_criteria": ["Dropdown is visible", "Menu items are clickable"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_menu_1", "task": "Click the 'Ingredients' button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ingredients"}, "ground_truth": {"screenshot": "allrecipes_menu_1_gt.png", "description": "Ingredients section expands", "visual_changes": ["Section expands", "Ingredient list visible"], "success_criteria": ["Section is expanded", "Ingredients are visible"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the cart icon", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "ground_truth": {"screenshot": "amazon_cart_1_gt.png", "description": "Cart page loads", "visual_changes": ["Page transitions to cart", "Cart contents visible"], "success_criteria": ["Cart page loads", "Cart status visible"]}, "target_html": 
""} +{"web_name": "Apple", "id": "apple_store_1", "task": "Click the 'Store' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Store"}, "ground_truth": {"screenshot": "apple_store_1_gt.png", "description": "Store page loads", "visual_changes": ["Page transitions to store", "Store products visible"], "success_criteria": ["Store page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_recent_1", "task": "Click the 'Recent' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "recent"}, "ground_truth": {"screenshot": "arxiv_recent_1_gt.png", "description": "Recent submissions page loads", "visual_changes": ["Page transitions to recent submissions", "Recent papers visible"], "success_criteria": ["Recent page loads", "Papers are visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_sport_1", "task": "Click the 'Sport' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Sport"}, "ground_truth": {"screenshot": "bbc_sport_1_gt.png", "description": "Sports section loads", "visual_changes": ["Page transitions to sports", "Sports news visible"], "success_criteria": ["Sports page loads", "Sports content visible"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_guests_1", "task": "Click the guests selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "xp__guests__count"}, "ground_truth": {"screenshot": "booking_guests_1_gt.png", "description": "Guests selector opens", "visual_changes": ["Guests overlay appears", "Guest options visible"], "success_criteria": ["Overlay is visible", "Guest controls are active"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_search_1", 
"task": "Click the search button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "sb-searchbox__button"}, "ground_truth": {"screenshot": "booking_search_1_gt.png", "description": "Search results load", "visual_changes": ["Page transitions to results", "Available properties shown"], "success_criteria": ["Results page loads", "Properties are visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_diet_1", "task": "Click the 'Dietary Restrictions' filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Dietary Restrictions"}, "ground_truth": {"screenshot": "allrecipes_diet_1_gt.png", "description": "Diet filter opens", "visual_changes": ["Filter dropdown appears", "Diet options visible"], "success_criteria": ["Dropdown is visible", "Options are clickable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_department_1", "task": "Click the department selector", "web": "https://www.amazon.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "searchDropdownBox"}, "ground_truth": {"screenshot": "amazon_department_1_gt.png", "description": "Department dropdown opens", "visual_changes": ["Dropdown menu appears", "Department list visible"], "success_criteria": ["Dropdown is visible", "Departments are selectable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_bag_1", "task": "Click the shopping bag icon", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-bag-item"}, "ground_truth": {"screenshot": "apple_bag_1_gt.png", "description": "Shopping bag overlay opens", "visual_changes": ["Bag overlay appears", "Cart contents visible"], "success_criteria": ["Overlay is visible", "Cart status shown"]}, "target_html": ""} +{"web_name": "ArXiv", 
"id": "arxiv_subject_1", "task": "Click the subject area dropdown", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "subject"}, "ground_truth": {"screenshot": "arxiv_subject_1_gt.png", "description": "Subject dropdown opens", "visual_changes": ["Dropdown menu appears", "Subject areas visible"], "success_criteria": ["Dropdown is visible", "Subjects are selectable"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_region_1", "task": "Click the region selector", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Choose your region"}, "ground_truth": {"screenshot": "bbc_region_1_gt.png", "description": "Region selector opens", "visual_changes": ["Region overlay appears", "Region options visible"], "success_criteria": ["Overlay is visible", "Regions are selectable"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_1", "task": "Click the translation language selector", "web": "https://dictionary.cambridge.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "translation-language"}, "ground_truth": {"screenshot": "cambridge_translate_1_gt.png", "description": "Language dropdown opens", "visual_changes": ["Dropdown menu appears", "Language options visible"], "success_criteria": ["Dropdown is visible", "Languages are selectable"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_sort_1", "task": "Click the sort dropdown", "web": "https://www.allrecipes.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "sort-dropdown"}, "ground_truth": {"screenshot": "allrecipes_sort_1_gt.png", "description": "Sort options appear", "visual_changes": ["Dropdown menu appears", "Sort options visible"], "success_criteria": ["Dropdown is visible", "Options are selectable"]}, "target_html": 
""} +{"web_name": "Amazon", "id": "amazon_language_1", "task": "Click the language selector", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "icp-nav-flyout"}, "ground_truth": {"screenshot": "amazon_language_1_gt.png", "description": "Language overlay opens", "visual_changes": ["Language overlay appears", "Language options visible"], "success_criteria": ["Overlay is visible", "Languages are selectable"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_weather_1", "task": "Click the weather widget", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "weather-widget"}, "ground_truth": {"screenshot": "bbc_weather_1_gt.png", "description": "Weather details expand", "visual_changes": ["Weather details appear", "Forecast visible"], "success_criteria": ["Weather details visible", "Forecast information shown"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_currency_1", "task": "Click the currency selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "bui-button__text"}, "ground_truth": {"screenshot": "booking_currency_1_gt.png", "description": "Currency selector opens", "visual_changes": ["Currency overlay appears", "Currency options visible"], "success_criteria": ["Overlay is visible", "Currencies are selectable"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_share_1", "task": "Click the share button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Share"}, "ground_truth": {"screenshot": "allrecipes_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Share options are clickable"]}, 
"target_html": ""} +{"web_name": "Amazon", "id": "amazon_account_1", "task": "Click the account menu", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-link-accountList"}, "ground_truth": {"screenshot": "amazon_account_1_gt.png", "description": "Account menu opens", "visual_changes": ["Account overlay appears", "Account options visible"], "success_criteria": ["Overlay is visible", "Account options are clickable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_support_1", "task": "Click the 'Support' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Support"}, "ground_truth": {"screenshot": "apple_support_1_gt.png", "description": "Support page loads", "visual_changes": ["Page transitions to support", "Support options visible"], "success_criteria": ["Support page loads", "Support content visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_help_1", "task": "Click the 'Help' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Help"}, "ground_truth": {"screenshot": "arxiv_help_1_gt.png", "description": "Help page loads", "visual_changes": ["Page transitions to help", "Help content visible"], "success_criteria": ["Help page loads", "Help content visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_video_1", "task": "Click the video player", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "media-player"}, "ground_truth": {"screenshot": "bbc_video_1_gt.png", "description": "Video player activates", "visual_changes": ["Video starts playing", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_grammar_1", 
"task": "Click the 'Grammar' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Grammar"}, "ground_truth": {"screenshot": "cambridge_grammar_1_gt.png", "description": "Grammar section loads", "visual_changes": ["Page transitions to grammar", "Grammar content visible"], "success_criteria": ["Grammar page loads", "Grammar content visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_print_1", "task": "Click the print button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Print"}, "ground_truth": {"screenshot": "allrecipes_print_1_gt.png", "description": "Print dialog opens", "visual_changes": ["Print overlay appears", "Print options visible"], "success_criteria": ["Print dialog visible", "Print options available"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_orders_1", "task": "Click the 'Returns & Orders' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-orders"}, "ground_truth": {"screenshot": "amazon_orders_1_gt.png", "description": "Orders page loads", "visual_changes": ["Page transitions to orders", "Order history visible"], "success_criteria": ["Orders page loads", "Order history visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_notification_1", "task": "Click the notification bell icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "notification-bell"}, "ground_truth": {"screenshot": "bbc_notification_1_gt.png", "description": "Notification settings open", "visual_changes": ["Notification overlay appears", "Notification options visible"], "success_criteria": ["Overlay is visible", "Settings are accessible"]}, "target_html": ""} +{"web_name": "Booking", "id": 
"booking_property_1", "task": "Click the property type filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Property type"}, "ground_truth": {"screenshot": "booking_property_1_gt.png", "description": "Property types appear", "visual_changes": ["Filter overlay appears", "Property options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_save_1", "task": "Click the save recipe button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Save Recipe"}, "ground_truth": {"screenshot": "allrecipes_save_1_gt.png", "description": "Save options appear", "visual_changes": ["Save overlay appears", "Collection options visible"], "success_criteria": ["Save dialog visible", "Collections are selectable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_filter_1", "task": "Click the price filter dropdown", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Price"}, "ground_truth": {"screenshot": "amazon_filter_1_gt.png", "description": "Price ranges appear", "visual_changes": ["Price overlay appears", "Range options visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_watch_1", "task": "Click the 'Watch' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Watch"}, "ground_truth": {"screenshot": "apple_watch_1_gt.png", "description": "Watch page loads", "visual_changes": ["Page transitions to Watch", "Watch products visible"], "success_criteria": ["Watch page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_pdf_1", "task": "Click the 
PDF link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "PDF"}, "ground_truth": {"screenshot": "arxiv_pdf_1_gt.png", "description": "PDF starts downloading", "visual_changes": ["Download starts", "Download indicator visible"], "success_criteria": ["Download begins", "Download status shown"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_business_1", "task": "Click the 'Business' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Business"}, "ground_truth": {"screenshot": "bbc_business_1_gt.png", "description": "Business section loads", "visual_changes": ["Page transitions to business", "Business news visible"], "success_criteria": ["Business page loads", "Business content visible"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_thesaurus_1", "task": "Click the 'Thesaurus' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Thesaurus"}, "ground_truth": {"screenshot": "cambridge_thesaurus_1_gt.png", "description": "Thesaurus section loads", "visual_changes": ["Page transitions to thesaurus", "Thesaurus content visible"], "success_criteria": ["Thesaurus page loads", "Synonyms are visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_review_1", "task": "Click the reviews tab", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Reviews"}, "ground_truth": {"screenshot": "allrecipes_review_1_gt.png", "description": "Reviews section opens", "visual_changes": ["Reviews section appears", "Review content visible"], "success_criteria": ["Reviews are visible", "Rating information shown"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_prime_1", "task": "Click the Prime 
benefits link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Prime"}, "ground_truth": {"screenshot": "amazon_prime_1_gt.png", "description": "Prime page loads", "visual_changes": ["Page transitions to Prime", "Prime benefits visible"], "success_criteria": ["Prime page loads", "Benefits are visible"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_ipad_1", "task": "Click the 'iPad' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPad"}, "ground_truth": {"screenshot": "apple_ipad_1_gt.png", "description": "iPad page loads", "visual_changes": ["Page transitions to iPad", "iPad products visible"], "success_criteria": ["iPad page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_abstract_1", "task": "Click the abstract toggle", "web": "https://arxiv.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Abstract"}, "ground_truth": {"screenshot": "arxiv_abstract_1_gt.png", "description": "Abstract expands", "visual_changes": ["Abstract section expands", "Full text visible"], "success_criteria": ["Abstract is expanded", "Text is readable"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_tech_1", "task": "Click the 'Technology' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Technology"}, "ground_truth": {"screenshot": "bbc_tech_1_gt.png", "description": "Technology section loads", "visual_changes": ["Page transitions to technology", "Tech news visible"], "success_criteria": ["Tech page loads", "Tech content visible"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_2", "task": "Click the search box and type 'bonjour'", "web": "https://dictionary.cambridge.org/", 
"element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "bonjour", "ground_truth": {"screenshot": "cambridge_translate_2_gt.png", "description": "Word entered in search", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_cuisine_1", "task": "Click the cuisine filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cuisine"}, "ground_truth": {"screenshot": "allrecipes_cuisine_1_gt.png", "description": "Cuisine options appear", "visual_changes": ["Filter overlay appears", "Cuisine options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_deals_1", "task": "Click the 'Today's Deals' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Today's Deals"}, "ground_truth": {"screenshot": "amazon_deals_1_gt.png", "description": "Deals page loads", "visual_changes": ["Page transitions to deals", "Deal items visible"], "success_criteria": ["Deals page loads", "Deals are visible"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_map_1", "task": "Click the map view button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Map"}, "ground_truth": {"screenshot": "booking_map_1_gt.png", "description": "Map view opens", "visual_changes": ["Map interface appears", "Property markers visible"], "success_criteria": ["Map is visible", "Properties are plotted"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_airpods_1", "task": "Click the 'AirPods' link", "web": "https://www.apple.com/", "element_type": "link", 
"interaction": "click", "target_element": {"type": "text", "value": "AirPods"}, "ground_truth": {"screenshot": "apple_airpods_1_gt.png", "description": "AirPods page loads", "visual_changes": ["Page transitions to AirPods", "AirPods products visible"], "success_criteria": ["AirPods page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_author_1", "task": "Click the author search link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Author"}, "ground_truth": {"screenshot": "arxiv_author_1_gt.png", "description": "Author search opens", "visual_changes": ["Author search interface appears", "Search options visible"], "success_criteria": ["Search interface visible", "Author field active"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_share_1", "task": "Click the share button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "share-button"}, "ground_truth": {"screenshot": "bbc_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Options are clickable"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_examples_1", "task": "Click the 'Examples' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Examples"}, "ground_truth": {"screenshot": "cambridge_examples_1_gt.png", "description": "Examples section loads", "visual_changes": ["Page transitions to examples", "Usage examples visible"], "success_criteria": ["Examples page loads", "Examples are visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_nutrition_1", "task": "Click the nutrition info button", "web": "https://www.allrecipes.com/", "element_type": "button", 
"interaction": "click", "target_element": {"type": "text", "value": "Nutrition"}, "ground_truth": {"screenshot": "allrecipes_nutrition_1_gt.png", "description": "Nutrition info appears", "visual_changes": ["Nutrition overlay appears", "Nutritional values visible"], "success_criteria": ["Overlay is visible", "Values are readable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_wishlist_1", "task": "Click the 'Add to List' button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Add to List"}, "ground_truth": {"screenshot": "amazon_wishlist_1_gt.png", "description": "List options appear", "visual_changes": ["List overlay appears", "List options visible"], "success_criteria": ["Overlay is visible", "Lists are selectable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_iphone_1", "task": "Click the 'iPhone' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPhone"}, "ground_truth": {"screenshot": "apple_iphone_1_gt.png", "description": "iPhone page loads", "visual_changes": ["Page transitions to iPhone", "iPhone products visible"], "success_criteria": ["iPhone page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_date_1", "task": "Click the date range filter", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "date-range"}, "ground_truth": {"screenshot": "arxiv_date_1_gt.png", "description": "Date options appear", "visual_changes": ["Date dropdown appears", "Range options visible"], "success_criteria": ["Dropdown is visible", "Ranges are selectable"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_climate_1", "task": "Click the 'Climate' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": 
"text", "value": "Climate"}, "ground_truth": {"screenshot": "bbc_climate_1_gt.png", "description": "Climate section loads", "visual_changes": ["Page transitions to climate", "Climate news visible"], "success_criteria": ["Climate page loads", "Climate content visible"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_rating_1", "task": "Click the rating filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Rating"}, "ground_truth": {"screenshot": "booking_rating_1_gt.png", "description": "Rating options appear", "visual_changes": ["Rating overlay appears", "Score options visible"], "success_criteria": ["Overlay is visible", "Ratings are selectable"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_browse_1", "task": "Click the 'Browse Dictionary' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Browse Dictionary"}, "ground_truth": {"screenshot": "cambridge_browse_1_gt.png", "description": "Browse page loads", "visual_changes": ["Page transitions to browse", "Word categories visible"], "success_criteria": ["Browse page loads", "Categories are visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_video_1", "task": "Click the recipe video play button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "video-play"}, "ground_truth": {"screenshot": "allrecipes_video_1_gt.png", "description": "Video starts playing", "visual_changes": ["Video begins playback", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_seller_1", "task": "Click the 'Other Sellers' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", 
"target_element": {"type": "text", "value": "Other Sellers"}, "ground_truth": {"screenshot": "amazon_seller_1_gt.png", "description": "Seller options appear", "visual_changes": ["Seller list appears", "Price options visible"], "success_criteria": ["Seller list visible", "Prices are shown"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_tv_1", "task": "Click the 'TV & Home' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "TV & Home"}, "ground_truth": {"screenshot": "apple_tv_1_gt.png", "description": "TV & Home page loads", "visual_changes": ["Page transitions to TV", "TV products visible"], "success_criteria": ["TV page loads", "Products are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_title_1", "task": "Click the search box and type 'machine learning'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "machine learning", "ground_truth": {"screenshot": "arxiv_title_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_science_1", "task": "Click the 'Science' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Science"}, "ground_truth": {"screenshot": "bbc_science_1_gt.png", "description": "Science section loads", "visual_changes": ["Page transitions to science", "Science news visible"], "success_criteria": ["Science page loads", "Science content visible"]}, "target_html": ""} +{"web_name": "Cambridge Dictionary", "id": "cambridge_word_1", "task": "Click the 'Word of the Day' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", 
"target_element": {"type": "text", "value": "Word of the Day"}, "ground_truth": {"screenshot": "cambridge_word_1_gt.png", "description": "Word of the Day loads", "visual_changes": ["Page transitions to word", "Word details visible"], "success_criteria": ["Word page loads", "Definition visible"]}, "target_html": ""} +{"web_name": "AllRecipes", "id": "allrecipes_time_1", "task": "Click the cooking time filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cooking Time"}, "ground_truth": {"screenshot": "allrecipes_time_1_gt.png", "description": "Time options appear", "visual_changes": ["Filter overlay appears", "Time ranges visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""} +{"web_name": "Amazon", "id": "amazon_gift_1", "task": "Click the 'Gift Cards' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Gift Cards"}, "ground_truth": {"screenshot": "amazon_gift_1_gt.png", "description": "Gift cards page loads", "visual_changes": ["Page transitions to gifts", "Gift card options visible"], "success_criteria": ["Gift page loads", "Options are visible"]}, "target_html": ""} +{"web_name": "Booking", "id": "booking_popular_1", "task": "Click the 'Popular filters' button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Popular filters"}, "ground_truth": {"screenshot": "booking_popular_1_gt.png", "description": "Popular filters appear", "visual_changes": ["Filter overlay appears", "Popular options visible"], "success_criteria": ["Overlay is visible", "Filters are selectable"]}, "target_html": ""} +{"web_name": "Apple", "id": "apple_music_1", "task": "Click the 'Music' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": 
"text", "value": "Music"}, "ground_truth": {"screenshot": "apple_music_1_gt.png", "description": "Music page loads", "visual_changes": ["Page transitions to music", "Music services visible"], "success_criteria": ["Music page loads", "Services are visible"]}, "target_html": ""} +{"web_name": "ArXiv", "id": "arxiv_stats_1", "task": "Click the 'Statistics' subject link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Statistics"}, "ground_truth": {"screenshot": "arxiv_stats_1_gt.png", "description": "Statistics papers load", "visual_changes": ["Page transitions to stats", "Statistics papers visible"], "success_criteria": ["Stats page loads", "Papers are visible"]}, "target_html": ""} +{"web_name": "BBC News", "id": "bbc_local_1", "task": "Click the 'Local News' link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Local News"}, "ground_truth": {"screenshot": "bbc_local_1_gt.png", "description": "Local news loads", "visual_changes": ["Page transitions to local", "Local news visible"], "success_criteria": ["Local page loads", "News is visible"]}, "target_html": ""} diff --git a/data/task_schema.json b/data/task_schema.json new file mode 100644 index 0000000..0ff97ff --- /dev/null +++ b/data/task_schema.json @@ -0,0 +1,108 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "DOM Task Schema", + "description": "Schema for DOM interaction tasks in the benchmark", + "type": "object", + "required": [ + "web_name", + "id", + "task", + "web", + "element_type", + "interaction", + "target_element", + "target_html", + "ground_truth" + ], + "properties": { + "web_name": { + "type": "string", + "description": "Name of the website" + }, + "id": { + "type": "string", + "description": "Unique identifier for the task", + "pattern": "^[a-z0-9_]+$" + }, + "task": { + "type": "string", + "description": 
"Human-readable task description" + }, + "web": { + "type": "string", + "description": "Website URL", + "format": "uri" + }, + "element_type": { + "type": "string", + "description": "Type of HTML element to interact with", + "enum": ["input", "button", "link", "div", "span"] + }, + "interaction": { + "type": "string", + "description": "Type of interaction to perform", + "enum": ["click", "type", "hover"] + }, + "target_element": { + "type": "object", + "description": "How to find the element", + "required": ["type", "value"], + "properties": { + "type": { + "type": "string", + "description": "Type of selector to use", + "enum": ["id", "class", "text"] + }, + "value": { + "type": "string", + "description": "Value of the selector" + } + } + }, + "input_text": { + "type": "string", + "description": "Text to type (only required for type interactions)" + }, + "target_html": { + "type": "string", + "description": "The actual HTML element to match against for validation" + }, + "ground_truth": { + "type": "object", + "description": "Validation data", + "required": [ + "screenshot", + "description", + "visual_changes", + "success_criteria" + ], + "properties": { + "screenshot": { + "type": "string", + "description": "Filename of the ground truth screenshot", + "pattern": "^[a-z0-9_]+\\.png$" + }, + "description": { + "type": "string", + "description": "Description of the expected outcome" + }, + "visual_changes": { + "type": "array", + "description": "List of expected visual changes", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "success_criteria": { + "type": "array", + "description": "List of specific conditions that must be met for success", + "items": { + "type": "string" + }, + "minItems": 1 + } + } + } + } +} diff --git a/evaluation/README.md b/evaluation/README.md index b379c3d..01cb9f2 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -4,10 +4,9 @@ This directory contains the evaluation tools for the DOM and DOMer-2 benchmark. 
## Overview -The evaluation uses GPT-4V to assess web interactions by analyzing: -1. Before/After screenshots of the webpage -2. Accessibility tree information -3. Task descriptions and expected outcomes +The evaluation system combines two approaches: +1. Visual Validation (60% of score): Using GPT-4V to analyze screenshots +2. HTML Element Validation (40% of score): Comparing actual HTML elements ## Usage @@ -21,20 +20,22 @@ python auto_eval.py \ ## Evaluation Process -1. **Screenshot Analysis** - - Compare before/after states +1. **Visual Validation (60%)** + - Compare before/after screenshots - Verify visual changes match expected interaction - Check element visibility and state changes + - Uses GPT-4V for intelligent visual comparison -2. **Accessibility Tree Verification** - - Validate correct element was targeted - - Check element attributes and relationships - - Verify element state changes +2. **HTML Element Validation (40%)** + - Compare model's selected HTML element with ground truth + - Structure score (40%): Tag hierarchy and relationships + - Attributes score (30%): Element properties and identifiers + - Content score (30%): Inner HTML and text content 3. **Success Criteria** - - Correct element identified and interacted with - - Expected visual changes occurred - - No unintended side effects + - Visual score ≥ 0.9 for visual validation + - HTML similarity score ≥ 0.9 for element validation + - Combined weighted score ≥ 0.9 for overall success ## Output Format @@ -45,15 +46,49 @@ python auto_eval.py \ "evaluations": [ { "task_id": "task_001", + "visual_evaluation": { + "score": 0.95, + "details": "Detailed visual evaluation..." + }, + "html_evaluation": { + "score": 0.92, + "structure_score": 0.95, + "attributes_score": 0.90, + "content_score": 0.89 + }, + "final_score": 0.94, "success": true, - "evaluation": "Detailed evaluation text...", "timestamp": 1234567890 - }, - ... 
+ } ] } ``` +## Scoring Details + +### Visual Score (60%) +- Element visibility and positioning +- State changes (hover effects, expansions) +- Content updates and transitions +- Overall visual accuracy + +### HTML Score (40%) +1. **Structure (40% of HTML score)** + - Correct tag name + - Parent-child relationships + - Sibling context + +2. **Attributes (30% of HTML score)** + - ID and class matching + - ARIA attributes + - Event handlers + - Custom data attributes + +3. **Content (30% of HTML score)** + - Inner HTML similarity + - Text content matching + - Nested element structure + ## Requirements - OpenAI API key with GPT-4V access diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py index f0f1a05..8b24a2a 100644 --- a/evaluation/auto_eval.py +++ b/evaluation/auto_eval.py @@ -5,111 +5,143 @@ import base64 from pathlib import Path from typing import List, Dict, Any - +import logging +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager from openai import OpenAI from dotenv import load_dotenv -SYSTEM_PROMPT = """As an evaluator for DOM and DOMer-2 benchmark, you will assess web element interactions based on visual comparison: - -1. Task Description: A specific web interaction task (e.g., "Click the search button", "Type text in input field") - -2. Visual Validation: - - Before: Initial webpage state - - After: Actual result after interaction - - Ground Truth: Expected result for successful interaction - - Expected Visual Changes: List of specific visual changes to verify - - Success Criteria: Specific conditions that must be met - -Your evaluation should: -1. Compare the after screenshot with the ground truth screenshot -2. 
Verify all listed visual changes occurred -3. Check if all success criteria are met -4. Pay special attention to the relevant regions where changes should occur - -Provide your evaluation as: -1. A score from 0-100 based on visual similarity and completion of expected changes -2. 'SUCCESS' if score ≥ 90, otherwise 'NOT SUCCESS' -3. Brief explanation of: - - Visual changes observed/missing - - Success criteria met/unmet - - Why the interaction succeeded or failed""" +SYSTEM_PROMPT = """You are an expert web automation evaluator. Your task is to: +1. Analyze the provided HTML source and accessibility tree +2. Identify and extract the complete HTML element that matches the target description +3. Score the visual interaction based on the provided before/after screenshots + +For HTML element selection: +- Return the complete HTML element including its attributes and inner content +- Consider the element's context and relationship with surrounding elements +- Ensure the selected element uniquely matches the target description + +For visual evaluation: +- Score how well the interaction matches the expected outcome +- Consider element visibility, positioning, and state changes +- Account for any dynamic content or loading states + +Provide your response in the following JSON format: +{ + "selected_html": "", + "visual_score": float, # 0.0 to 1.0 + "confidence": float, # 0.0 to 1.0 + "reasoning": "string" # Brief explanation of your evaluation +}""" def encode_image(image_path: str) -> str: """Encode image as base64 string""" with open(image_path, "rb") as f: return base64.b64encode(f.read()).decode('utf-8') +def get_element_html_context(driver: webdriver.Chrome, element) -> str: + """Get HTML context of an element""" + return driver.execute_script("return arguments[0].outerHTML;", element) + +def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]: + """Get accessibility tree of the current page""" + return driver.execute_script("return 
window.axe.getEntireContext();") + +def compare_html_elements(html1: str, html2: str) -> Dict[str, Any]: + """Compare two HTML elements""" + # Implement HTML comparison logic here + # For demonstration purposes, return a dummy score + return {"total_score": 0.8, "attribute_score": 0.9, "content_score": 0.7, "structure_score": 0.8} + +def get_llm_evaluation(context: Dict[str, Any]) -> Dict[str, Any]: + """Get LLM evaluation""" + # Implement LLM evaluation logic here + # For demonstration purposes, return a dummy response + return { + "selected_html": "
Selected HTML element
", + "visual_score": 0.9, + "confidence": 0.8, + "reasoning": "Brief explanation of the evaluation" + } + def evaluate_task( task: Dict[str, Any], result: Dict[str, Any], ground_truth: Dict[str, Any], openai_client: OpenAI ) -> Dict[str, Any]: - """Evaluate a single task using GPT-4V based on visual comparison""" - - messages = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": f""" -Task: {task['task']} - -Please compare: -1. Before screenshot (initial state) -2. After screenshot (actual result) -3. Ground truth screenshot (expected result) - -Expected visual changes: -{json.dumps(ground_truth['visual_changes'], indent=2)} - -Success criteria: -{json.dumps(ground_truth['success_criteria'], indent=2)} - -Provide: -1. Similarity score (0-100) -2. Success status -3. Brief explanation"""}, - {"role": "assistant", "content": "I'll examine the screenshots and evaluate based on visual similarity, expected changes, and success criteria."}, - {"role": "user", "content": [ - {"type": "text", "text": "Before interaction:"}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}}, - {"type": "text", "text": "After interaction:"}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}}, - {"type": "text", "text": "Ground Truth:"}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}}, - ]} - ] - + """Evaluate a task using both visual comparison and HTML matching""" try: - response = openai_client.chat.completions.create( + # 1. 
Visual Evaluation (existing) + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": [ + {"type": "text", "text": "Compare these screenshots:"}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['before_screenshot'])}"}}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(result['after_screenshot'])}"}}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(ground_truth['screenshot'])}"}}, + ]} + ] + + visual_response = openai_client.chat.completions.create( model="gpt-4-vision-preview", messages=messages, max_tokens=1000, temperature=0 ) - evaluation = response.choices[0].message.content + # 2. HTML Element Matching (new) + html_score = compare_html_elements( + result.get('html_element', ''), # From model's response + ground_truth.get('target_html', ''), # From ground truth + ) + + # 3. Combine Scores + visual_score = extract_score(visual_response.choices[0].message.content) - # Extract score and success status - import re - score_match = re.search(r'(\d+)(?=/100|%)', evaluation) - score = int(score_match.group(1)) if score_match else 0 + final_score = ( + visual_score * 0.6 + # Weight visual score more + html_score['total_score'] * 0.4 # HTML matching score + ) return { "task_id": task["id"], - "score": score, - "success": score >= 90, - "evaluation": evaluation, + "visual_evaluation": { + "score": visual_score, + "details": visual_response.choices[0].message.content + }, + "html_evaluation": { + "score": html_score['total_score'], + "structure_score": html_score['structure_score'], + "attributes_score": html_score['attributes_score'], + "content_score": html_score['content_score'] + }, + "final_score": final_score, + "success": final_score >= 0.9, "timestamp": int(time.time()) } except Exception as e: + logging.error(f"Error evaluating task: {str(e)}") return { "task_id": task["id"], - "score": 0, + "error": str(e), + 
"final_score": 0.0, "success": False, - "evaluation": f"Evaluation failed: {str(e)}", "timestamp": int(time.time()) } +def extract_score(evaluation_text: str) -> float: + """Extract numerical score from evaluation text""" + import re + score_match = re.search(r'(\d+)(?=/100|%)', evaluation_text) + return float(score_match.group(1)) / 100 if score_match else 0.0 + def run_evaluation( tasks_file: Path, results_dir: Path, @@ -136,18 +168,22 @@ def run_evaluation( for task in tasks: task_result = next((r for r in results if r["task_id"] == task["id"]), None) if task_result: - evaluation = evaluate_task( - task, - task_result, - ground_truth_dir, - openai_client - ) - evaluations.append(evaluation) + ground_truth = next((gt for gt in ground_truth_dir.iterdir() if gt.name == f"{task['id']}.json"), None) + if ground_truth: + with open(ground_truth) as f: + ground_truth_data = json.load(f) + evaluation = evaluate_task( + task, + task_result, + ground_truth_data, + openai_client + ) + evaluations.append(evaluation) # Save evaluations output = { "total_tasks": len(tasks), - "successful_tasks": sum(1 for e in evaluations if e["success"]), + "successful_tasks": sum(1 for e in evaluations if e["final_score"] > 0.5), "evaluations": evaluations } diff --git a/prompts.py b/prompts.py index 1499e23..9915db8 100644 --- a/prompts.py +++ b/prompts.py @@ -1,50 +1,48 @@ from typing import Dict, Any -SYSTEM_PROMPT = """You are an AI agent designed to interact with web elements. Your task is to execute specific web interactions based on natural language descriptions. - -Focus on the following: -1. Element Identification: Use the provided accessibility tree and visual context to identify the correct element -2. Precise Interaction: Execute the exact interaction required (click, type, hover) -3. Accuracy: Ensure you interact with the correct element, as there may be similar elements on the page +SYSTEM_PROMPT = """You are an AI agent designed to interact with web elements. 
Your task is to: +1. Execute the specified web interaction (click, type, etc.) +2. Return the exact HTML element you interacted with Guidelines: -- Pay attention to element attributes (role, type, name) in the accessibility tree -- Consider the visual context and location of elements -- Be precise in your interactions - click exactly where specified -- Handle dynamic elements and wait for page loads appropriately +- Execute the interaction precisely as specified +- Return the complete HTML element including all attributes and content +- Use the accessibility tree to help identify the correct element +- Consider both visual context and element attributes -Example Task: +Your response MUST be in this exact JSON format: { - "web_name": "Amazon", - "task": "Click the search button", - "web": "https://www.amazon.com", - "element_type": "button", - "interaction": "click", - "target_element": { - "type": "id", - "value": "nav-search-submit-button" - } + "action": { + "type": "click|type|hover|etc", + "value": "text to type if applicable" + }, + "html_element": "", + "confidence": 0.95 # How confident you are in your selection } -Remember: Your goal is to execute the interaction accurately and efficiently. -""" +Example: +Task: "Click the search button" +Response: +{ + "action": { + "type": "click", + "value": null + }, + "html_element": "", + "confidence": 0.95 +}""" def format_task_prompt(task: Dict[str, Any], accessibility_tree: Dict[str, Any] = None) -> str: """Format task into prompt for the agent""" prompt = f"""Website: {task['web_name']} Task: {task['task']} URL: {task['web']} -Required Interaction: {task['interaction']} -Target Element Type: {task['element_type']} -Accessibility Tree Information: -""" - - if accessibility_tree: - prompt += f"```json\n{accessibility_tree}\n```\n" - else: - prompt += "Not available\n" - - prompt += "\nPlease execute the specified interaction accurately." 
+Accessibility Tree: +```json +{accessibility_tree if accessibility_tree else 'Not available'} +``` + +Execute the task and return both your action and the HTML element you interacted with.""" return prompt diff --git a/results/README.md b/results/README.md new file mode 100644 index 0000000..1a1ab3c --- /dev/null +++ b/results/README.md @@ -0,0 +1,70 @@ +# Results Directory + +This directory stores benchmark results and evaluations. + +## Directory Structure + +``` +results/ +├── run_001/ # Each run in its own directory +│ ├── results.json # Raw results from model +│ ├── evaluation.json # Evaluation scores +│ └── screenshots/ # Before/after screenshots +├── run_002/ +└── ... +``` + +## File Formats + +### `results.json` +```json +{ + "task_id": "task_001", + "action": { + "type": "click", + "value": null + }, + "html_element": "", + "confidence": 0.95, + "screenshots": { + "before": "before_001.png", + "after": "after_001.png" + } +} +``` + +### `evaluation.json` +```json +{ + "task_id": "task_001", + "visual_evaluation": { + "score": 0.95, + "details": "..." + }, + "html_evaluation": { + "score": 0.92, + "structure_score": 0.95, + "attributes_score": 0.90, + "content_score": 0.89 + }, + "final_score": 0.94, + "success": true +} +``` + +## Guidelines + +1. **Organization** + - Create a new directory for each benchmark run + - Use consistent naming: `run_XXX` + - Keep screenshots organized by task + +2. **Storage** + - Clean up old runs periodically + - Compress screenshots if needed + - Back up important results + +3. **Analysis** + - Use evaluation.json for metrics + - Compare runs to track improvements + - Document significant changes diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..284342f --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,40 @@ +# Scripts Directory + +This directory contains utility scripts for managing and maintaining the benchmark. 
def add_target_html_field(path='../data/dom_tasks.jsonl'):
    """Add an empty ``target_html`` field to every task that lacks one.

    The JSONL file at ``path`` is read completely, each task without a
    ``target_html`` key gets ``""`` for it, and the file is rewritten with one
    JSON object per line. Tasks that already have the key are left unchanged.

    Args:
        path: JSONL task file to upgrade. The default is relative to the
            current working directory, matching the original script.

    Raises:
        OSError: If the file cannot be read or written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import os

    tasks = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            task = json.loads(line)
            task.setdefault('target_html', "")
            tasks.append(task)

    # Write to a sibling file and swap it in, so a failure while writing
    # cannot leave the task file truncated.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(task) + '\n' for task in tasks)
    os.replace(tmp_path, path)


if __name__ == "__main__":
    add_target_html_field()
def get_element_html_context(driver: webdriver.Chrome, element) -> Dict[str, Any]:
    """Get HTML context for an element including its structure and surrounding elements.

    Args:
        driver: Not used by this function; kept so existing callers keep working.
        element: The element to describe. It must provide ``get_attribute``,
            ``tag_name``, ``find_element`` and ``find_elements``.

    Returns:
        A dict with these keys:
          - ``outer_html`` / ``inner_html``: the element's ``outerHTML`` and
            ``innerHTML`` attributes.
          - ``tag_name``: the element's tag name.
          - ``attributes``: the non-empty values among id, class, name, type,
            value, href and src.
          - ``parent_tag``: tag name of the node selected by XPath ``..``.
          - ``siblings``: tag names of up to three other children of the parent.
    """
    # Read each attribute once; the original queried every name twice
    # (once in the filter, once for the value).
    attributes = {}
    for name in ("id", "class", "name", "type", "value", "href", "src"):
        value = element.get_attribute(name)
        if value:
            attributes[name] = value

    # Stop after three siblings instead of reading every sibling's tag name
    # and discarding all but the first three.
    siblings = []
    for candidate in element.find_elements("xpath", "../*"):
        if candidate != element:
            siblings.append(candidate.tag_name)
            if len(siblings) == 3:
                break

    return {
        "outer_html": element.get_attribute("outerHTML"),
        "inner_html": element.get_attribute("innerHTML"),
        "tag_name": element.tag_name,
        "attributes": attributes,
        # The original's `if element else None` guard ran only after
        # `element` had already been dereferenced, so it could never
        # protect against a missing element; it is dropped.
        "parent_tag": element.find_element("xpath", "..").tag_name,
        "siblings": siblings,
    }
def compare_html_elements(suggested: Dict[str, Any], target: Dict[str, Any]) -> Dict[str, float]:
    """Compare two HTML element contexts and return similarity scores.

    Both arguments are dicts with the keys ``tag_name``, ``parent_tag``,
    ``siblings`` (list of tag names), ``attributes`` (name -> value) and
    ``inner_html``.

    Scoring:
      - structure (40% of total): tag match 0.5, parent tag match 0.3,
        sibling overlap 0.2.
      - attributes (30% of total): fraction of the target's attributes that
        the suggestion has with an equal value.
      - content (30% of total): ``SequenceMatcher`` ratio of the two
        ``inner_html`` strings.

    Args:
        suggested: Context of the element that was chosen.
        target: Context of the ground-truth element.

    Returns:
        Dict with ``structure_score``, ``attributes_score``,
        ``content_score`` and the weighted ``total_score``, each in [0, 1].
        Two identical contexts score 1.0 on every component.
    """
    from collections import Counter
    from difflib import SequenceMatcher

    def _coverage(matched: int, expected: int, offered: int) -> float:
        """Fraction of expected items matched; two empty sides agree fully."""
        if expected:
            return matched / expected
        return 1.0 if not offered else 0.0

    # Sibling overlap is counted as a multiset: with sets, repeated tags
    # (e.g. three <li> siblings) collapse to one and an exact match would
    # score only 1/3.
    suggested_siblings = suggested["siblings"]
    target_siblings = target["siblings"]
    shared_siblings = sum((Counter(suggested_siblings) & Counter(target_siblings)).values())
    sibling_score = _coverage(shared_siblings, len(target_siblings), len(suggested_siblings))

    # Structure score (40%)
    structure_score = (
        (0.5 if suggested["tag_name"] == target["tag_name"] else 0.0)
        + (0.3 if suggested["parent_tag"] == target["parent_tag"] else 0.0)
        + sibling_score * 0.2
    )

    # Attributes score (30%)
    suggested_attrs = suggested["attributes"]
    target_attrs = target["attributes"]
    matching_attrs = sum(
        1
        for name, value in target_attrs.items()
        if name in suggested_attrs and suggested_attrs[name] == value
    )
    attrs_score = _coverage(matching_attrs, len(target_attrs), len(suggested_attrs))

    # Content similarity score (30%); a missing inner_html is treated as
    # empty text so SequenceMatcher does not fail on None.
    content_score = SequenceMatcher(
        None,
        suggested["inner_html"] or "",
        target["inner_html"] or "",
    ).ratio()

    return {
        "structure_score": structure_score,
        "attributes_score": attrs_score,
        "content_score": content_score,
        "total_score": (
            structure_score * 0.4
            + attrs_score * 0.3
            + content_score * 0.3
        ),
    }