diff --git a/README.md b/README.md
index b56ff30..0f15982 100644
--- a/README.md
+++ b/README.md
@@ -69,25 +69,62 @@ Located in `data/ground_truth/`, each task has:
 - `[task_id]_gt.png`: Screenshot of successful interaction
 - Description in task JSON explaining expected changes
 
+## Environment Setup
+
+1. Create a virtual environment and install dependencies:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+pip install -r requirements.txt
+```
+
+2. Set up environment variables in `.env`:
+```bash
+OPENAI_API_KEY=your_openai_api_key
+```
+
 ## Running the Benchmark
 
-1. **Run Tests**:
+1. Run tasks:
+```bash
+python run.py --tasks data/dom_tasks.jsonl --output results --evaluate
+```
+
+This will:
+- Execute each task in the tasks file
+- Save screenshots and results to the output directory
+- Run GPT-4V evaluation if the `--evaluate` flag is specified
+
+## Ground Truth Management
+
+Ground truth images are stored in `evaluation/ground_truth/` with a consistent naming scheme (`task_<id>_gt.png`):
+```
+evaluation/ground_truth/
+├── task_1_gt.png
+└── task_2_gt.png
+...
+```
+
+The tasks file references these images using relative paths:
+```json
+{
+  "id": 1,
+  "ground_truth": {
+    "screenshot": "evaluation/ground_truth/task_1_gt.png"
+  }
+}
+```
+
+## Testing
+
+Run environment tests:
 ```bash
-python run.py \
-    --tasks data/dom_tasks.jsonl \
-    --output results/run_001 \
-    --headless \
-    --save-accessibility-tree
+python test_env.py
 ```
 
-2. **Evaluate Results**:
+Run OpenAI API connection test:
 ```bash
-python evaluation/auto_eval.py \
-    --tasks data/dom_tasks.jsonl \
-    --results results/run_001 \
-    --ground-truth data/ground_truth \
-    --output results/run_001/evaluation.json \
-    --openai-key YOUR_API_KEY
+python test_openai.py
 ```
 
 ## Evaluation Process
diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl
index a5250c5..8be12ae 100644
--- a/data/dom_tasks.jsonl
+++ b/data/dom_tasks.jsonl
@@ -1,80 +1 @@
-{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "target_html": "", "ground_truth": {"screenshot": "cambridge_lookup_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_search_1", "task": "Click the search button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "cdo-search-button"}, "target_html": "", "ground_truth": {"screenshot": "cambridge_search_1_gt.png", "description": "The search results for 'hello' are displayed", "visual_changes": ["Search button appears pressed", "Page transitions to search results", "Definition of 'hello' is displayed"], "success_criteria": ["Search button responds to click", "Results page loads", "No error messages displayed"]}}
-{"web_name": "AllRecipes", "id": "allrecipes_search_1", "task": "Click the search box and type 'vegetarian lasagna'", "web": "https://www.allrecipes.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-box"}, "input_text": "vegetarian lasagna", "ground_truth": {"screenshot": "allrecipes_search_1_gt.png", "description": "Search term entered in search box", "visual_changes": ["Text appears in search box", "Search suggestions may appear"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_filter_1", "task": "Click the 'Ratings' filter button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ratings"}, "ground_truth": {"screenshot": "allrecipes_filter_1_gt.png", "description": "Ratings filter dropdown opens", "visual_changes": ["Dropdown menu appears", "Filter options visible"], "success_criteria": ["Dropdown menu is visible", "Filter options are clickable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_search_1", "task": "Click the search box and type 'laptop'", "web": "https://www.amazon.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "twotabsearchtextbox"}, "input_text": "laptop", "ground_truth": {"screenshot": "amazon_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Search suggestions appear"], "success_criteria": ["Text matches exactly", "Search suggestions visible"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_menu_1", "task": "Click the hamburger menu button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "nav-hamburger-menu"}, "ground_truth": {"screenshot": "amazon_menu_1_gt.png", "description": "Side menu opens", "visual_changes": ["Menu slides in from left", "Menu options visible"], "success_criteria": ["Menu is visible", "Menu items are clickable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_menu_1", "task": "Click the 'Mac' menu item", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Mac"}, "ground_truth": {"screenshot": "apple_menu_1_gt.png", "description": "Mac page loads", "visual_changes": ["Page transitions to Mac section", "Mac products visible"], "success_criteria": ["Page URL changes", "Mac content visible"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_search_1", "task": "Click the search icon", "web": "https://www.apple.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-search-trigger"}, "ground_truth": {"screenshot": "apple_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search box is active"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_search_1", "task": "Click the search box and type 'quantum computing'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "quantum computing", "ground_truth": {"screenshot": "arxiv_search_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box"], "success_criteria": ["Text matches exactly", "Search box contains text"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_filter_1", "task": "Click the 'Advanced Search' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Advanced Search"}, "ground_truth": {"screenshot": "arxiv_filter_1_gt.png", "description": "Advanced search page loads", "visual_changes": ["Page transitions to advanced search", "Search filters visible"], "success_criteria": ["URL changes to advanced search", "Advanced search form visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_menu_1", "task": "Click the menu button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "id", "value": "orbit-more-menu"}, "ground_truth": {"screenshot": "bbc_menu_1_gt.png", "description": "Menu overlay opens", "visual_changes": ["Menu overlay appears", "Navigation options visible"], "success_criteria": ["Menu overlay is visible", "Menu items are clickable"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_search_1", "task": "Click the search icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "orbit-search__button"}, "ground_truth": {"screenshot": "bbc_search_1_gt.png", "description": "Search overlay opens", "visual_changes": ["Search overlay appears", "Search box focused"], "success_criteria": ["Search overlay visible", "Search input is active"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_destination_1", "task": "Click the destination input and type 'Paris'", "web": "https://www.booking.com/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "ss"}, "input_text": "Paris", "ground_truth": {"screenshot": "booking_destination_1_gt.png", "description": "Destination entered", "visual_changes": ["Text appears in input", "Location suggestions appear"], "success_criteria": ["Text matches exactly", "Suggestions are visible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_dates_1", "task": "Click the check-in date field", "web": "https://www.booking.com/", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "sb-date-field__field"}, "ground_truth": {"screenshot": "booking_dates_1_gt.png", "description": "Calendar overlay opens", "visual_changes": ["Calendar overlay appears", "Available dates highlighted"], "success_criteria": ["Calendar is visible", "Dates are selectable"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_menu_1", "task": "Click the 'More' menu button", "web": "https://dictionary.cambridge.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "More"}, "ground_truth": {"screenshot": "cambridge_menu_1_gt.png", "description": "More menu opens", "visual_changes": ["Dropdown menu appears", "Menu options visible"], "success_criteria": ["Dropdown is visible", "Menu items are clickable"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_menu_1", "task": "Click the 'Ingredients' button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Ingredients"}, "ground_truth": {"screenshot": "allrecipes_menu_1_gt.png", "description": "Ingredients section expands", "visual_changes": ["Section expands", "Ingredient list visible"], "success_criteria": ["Section is expanded", "Ingredients are visible"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the cart icon", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "ground_truth": {"screenshot": "amazon_cart_1_gt.png", "description": "Cart page loads", "visual_changes": ["Page transitions to cart", "Cart contents visible"], "success_criteria": ["Cart page loads", "Cart status visible"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_store_1", "task": "Click the 'Store' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Store"}, "ground_truth": {"screenshot": "apple_store_1_gt.png", "description": "Store page loads", "visual_changes": ["Page transitions to store", "Store products visible"], "success_criteria": ["Store page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_recent_1", "task": "Click the 'Recent' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "recent"}, "ground_truth": {"screenshot": "arxiv_recent_1_gt.png", "description": "Recent submissions page loads", "visual_changes": ["Page transitions to recent submissions", "Recent papers visible"], "success_criteria": ["Recent page loads", "Papers are visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_sport_1", "task": "Click the 'Sport' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Sport"}, "ground_truth": {"screenshot": "bbc_sport_1_gt.png", "description": "Sports section loads", "visual_changes": ["Page transitions to sports", "Sports news visible"], "success_criteria": ["Sports page loads", "Sports content visible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_guests_1", "task": "Click the guests selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "xp__guests__count"}, "ground_truth": {"screenshot": "booking_guests_1_gt.png", "description": "Guests selector opens", "visual_changes": ["Guests overlay appears", "Guest options visible"], "success_criteria": ["Overlay is visible", "Guest controls are active"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_search_1", "task": "Click the search button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "sb-searchbox__button"}, "ground_truth": {"screenshot": "booking_search_1_gt.png", "description": "Search results load", "visual_changes": ["Page transitions to results", "Available properties shown"], "success_criteria": ["Results page loads", "Properties are visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_diet_1", "task": "Click the 'Dietary Restrictions' filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Dietary Restrictions"}, "ground_truth": {"screenshot": "allrecipes_diet_1_gt.png", "description": "Diet filter opens", "visual_changes": ["Filter dropdown appears", "Diet options visible"], "success_criteria": ["Dropdown is visible", "Options are clickable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_department_1", "task": "Click the department selector", "web": "https://www.amazon.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "searchDropdownBox"}, "ground_truth": {"screenshot": "amazon_department_1_gt.png", "description": "Department dropdown opens", "visual_changes": ["Dropdown menu appears", "Department list visible"], "success_criteria": ["Dropdown is visible", "Departments are selectable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_bag_1", "task": "Click the shopping bag icon", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "globalnav-bag-item"}, "ground_truth": {"screenshot": "apple_bag_1_gt.png", "description": "Shopping bag overlay opens", "visual_changes": ["Bag overlay appears", "Cart contents visible"], "success_criteria": ["Overlay is visible", "Cart status shown"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_subject_1", "task": "Click the subject area dropdown", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "subject"}, "ground_truth": {"screenshot": "arxiv_subject_1_gt.png", "description": "Subject dropdown opens", "visual_changes": ["Dropdown menu appears", "Subject areas visible"], "success_criteria": ["Dropdown is visible", "Subjects are selectable"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_region_1", "task": "Click the region selector", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Choose your region"}, "ground_truth": {"screenshot": "bbc_region_1_gt.png", "description": "Region selector opens", "visual_changes": ["Region overlay appears", "Region options visible"], "success_criteria": ["Overlay is visible", "Regions are selectable"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_1", "task": "Click the translation language selector", "web": "https://dictionary.cambridge.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "translation-language"}, "ground_truth": {"screenshot": "cambridge_translate_1_gt.png", "description": "Language dropdown opens", "visual_changes": ["Dropdown menu appears", "Language options visible"], "success_criteria": ["Dropdown is visible", "Languages are selectable"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_sort_1", "task": "Click the sort dropdown", "web": "https://www.allrecipes.com/", "element_type": "select", "interaction": "click", "target_element": {"type": "id", "value": "sort-dropdown"}, "ground_truth": {"screenshot": "allrecipes_sort_1_gt.png", "description": "Sort options appear", "visual_changes": ["Dropdown menu appears", "Sort options visible"], "success_criteria": ["Dropdown is visible", "Options are selectable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_language_1", "task": "Click the language selector", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "icp-nav-flyout"}, "ground_truth": {"screenshot": "amazon_language_1_gt.png", "description": "Language overlay opens", "visual_changes": ["Language overlay appears", "Language options visible"], "success_criteria": ["Overlay is visible", "Languages are selectable"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_weather_1", "task": "Click the weather widget", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "weather-widget"}, "ground_truth": {"screenshot": "bbc_weather_1_gt.png", "description": "Weather details expand", "visual_changes": ["Weather details appear", "Forecast visible"], "success_criteria": ["Weather details visible", "Forecast information shown"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_currency_1", "task": "Click the currency selector", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "bui-button__text"}, "ground_truth": {"screenshot": "booking_currency_1_gt.png", "description": "Currency selector opens", "visual_changes": ["Currency overlay appears", "Currency options visible"], "success_criteria": ["Overlay is visible", "Currencies are selectable"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_share_1", "task": "Click the share button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Share"}, "ground_truth": {"screenshot": "allrecipes_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Share options are clickable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_account_1", "task": "Click the account menu", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-link-accountList"}, "ground_truth": {"screenshot": "amazon_account_1_gt.png", "description": "Account menu opens", "visual_changes": ["Account overlay appears", "Account options visible"], "success_criteria": ["Overlay is visible", "Account options are clickable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_support_1", "task": "Click the 'Support' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Support"}, "ground_truth": {"screenshot": "apple_support_1_gt.png", "description": "Support page loads", "visual_changes": ["Page transitions to support", "Support options visible"], "success_criteria": ["Support page loads", "Support content visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_help_1", "task": "Click the 'Help' link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Help"}, "ground_truth": {"screenshot": "arxiv_help_1_gt.png", "description": "Help page loads", "visual_changes": ["Page transitions to help", "Help content visible"], "success_criteria": ["Help page loads", "Help content visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_video_1", "task": "Click the video player", "web": "https://www.bbc.com/news", "element_type": "div", "interaction": "click", "target_element": {"type": "class", "value": "media-player"}, "ground_truth": {"screenshot": "bbc_video_1_gt.png", "description": "Video player activates", "visual_changes": ["Video starts playing", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_grammar_1", "task": "Click the 'Grammar' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Grammar"}, "ground_truth": {"screenshot": "cambridge_grammar_1_gt.png", "description": "Grammar section loads", "visual_changes": ["Page transitions to grammar", "Grammar content visible"], "success_criteria": ["Grammar page loads", "Grammar content visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_print_1", "task": "Click the print button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Print"}, "ground_truth": {"screenshot": "allrecipes_print_1_gt.png", "description": "Print dialog opens", "visual_changes": ["Print overlay appears", "Print options visible"], "success_criteria": ["Print dialog visible", "Print options available"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_orders_1", "task": "Click the 'Returns & Orders' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-orders"}, "ground_truth": {"screenshot": "amazon_orders_1_gt.png", "description": "Orders page loads", "visual_changes": ["Page transitions to orders", "Order history visible"], "success_criteria": ["Orders page loads", "Order history visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_notification_1", "task": "Click the notification bell icon", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "notification-bell"}, "ground_truth": {"screenshot": "bbc_notification_1_gt.png", "description": "Notification settings open", "visual_changes": ["Notification overlay appears", "Notification options visible"], "success_criteria": ["Overlay is visible", "Settings are accessible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_property_1", "task": "Click the property type filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Property type"}, "ground_truth": {"screenshot": "booking_property_1_gt.png", "description": "Property types appear", "visual_changes": ["Filter overlay appears", "Property options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_save_1", "task": "Click the save recipe button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Save Recipe"}, "ground_truth": {"screenshot": "allrecipes_save_1_gt.png", "description": "Save options appear", "visual_changes": ["Save overlay appears", "Collection options visible"], "success_criteria": ["Save dialog visible", "Collections are selectable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_filter_1", "task": "Click the price filter dropdown", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Price"}, "ground_truth": {"screenshot": "amazon_filter_1_gt.png", "description": "Price ranges appear", "visual_changes": ["Price overlay appears", "Range options visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_watch_1", "task": "Click the 'Watch' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Watch"}, "ground_truth": {"screenshot": "apple_watch_1_gt.png", "description": "Watch page loads", "visual_changes": ["Page transitions to Watch", "Watch products visible"], "success_criteria": ["Watch page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_pdf_1", "task": "Click the PDF link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "PDF"}, "ground_truth": {"screenshot": "arxiv_pdf_1_gt.png", "description": "PDF starts downloading", "visual_changes": ["Download starts", "Download indicator visible"], "success_criteria": ["Download begins", "Download status shown"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_business_1", "task": "Click the 'Business' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Business"}, "ground_truth": {"screenshot": "bbc_business_1_gt.png", "description": "Business section loads", "visual_changes": ["Page transitions to business", "Business news visible"], "success_criteria": ["Business page loads", "Business content visible"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_thesaurus_1", "task": "Click the 'Thesaurus' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Thesaurus"}, "ground_truth": {"screenshot": "cambridge_thesaurus_1_gt.png", "description": "Thesaurus section loads", "visual_changes": ["Page transitions to thesaurus", "Thesaurus content visible"], "success_criteria": ["Thesaurus page loads", "Synonyms are visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_review_1", "task": "Click the reviews tab", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Reviews"}, "ground_truth": {"screenshot": "allrecipes_review_1_gt.png", "description": "Reviews section opens", "visual_changes": ["Reviews section appears", "Review content visible"], "success_criteria": ["Reviews are visible", "Rating information shown"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_prime_1", "task": "Click the Prime benefits link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Prime"}, "ground_truth": {"screenshot": "amazon_prime_1_gt.png", "description": "Prime page loads", "visual_changes": ["Page transitions to Prime", "Prime benefits visible"], "success_criteria": ["Prime page loads", "Benefits are visible"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_ipad_1", "task": "Click the 'iPad' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPad"}, "ground_truth": {"screenshot": "apple_ipad_1_gt.png", "description": "iPad page loads", "visual_changes": ["Page transitions to iPad", "iPad products visible"], "success_criteria": ["iPad page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_abstract_1", "task": "Click the abstract toggle", "web": "https://arxiv.org/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Abstract"}, "ground_truth": {"screenshot": "arxiv_abstract_1_gt.png", "description": "Abstract expands", "visual_changes": ["Abstract section expands", "Full text visible"], "success_criteria": ["Abstract is expanded", "Text is readable"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_tech_1", "task": "Click the 'Technology' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Technology"}, "ground_truth": {"screenshot": "bbc_tech_1_gt.png", "description": "Technology section loads", "visual_changes": ["Page transitions to technology", "Tech news visible"], "success_criteria": ["Tech page loads", "Tech content visible"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_translate_2", "task": "Click the search box and type 'bonjour'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "bonjour", "ground_truth": {"screenshot": "cambridge_translate_2_gt.png", "description": "Word entered in search", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_cuisine_1", "task": "Click the cuisine filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cuisine"}, "ground_truth": {"screenshot": "allrecipes_cuisine_1_gt.png", "description": "Cuisine options appear", "visual_changes": ["Filter overlay appears", "Cuisine options visible"], "success_criteria": ["Overlay is visible", "Options are selectable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_deals_1", "task": "Click the 'Today's Deals' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Today's Deals"}, "ground_truth": {"screenshot": "amazon_deals_1_gt.png", "description": "Deals page loads", "visual_changes": ["Page transitions to deals", "Deal items visible"], "success_criteria": ["Deals page loads", "Deals are visible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_map_1", "task": "Click the map view button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Map"}, "ground_truth": {"screenshot": "booking_map_1_gt.png", "description": "Map view opens", "visual_changes": ["Map interface appears", "Property markers visible"], "success_criteria": ["Map is visible", "Properties are plotted"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_airpods_1", "task": "Click the 'AirPods' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "AirPods"}, "ground_truth": {"screenshot": "apple_airpods_1_gt.png", "description": "AirPods page loads", "visual_changes": ["Page transitions to AirPods", "AirPods products visible"], "success_criteria": ["AirPods page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_author_1", "task": "Click the author search link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Author"}, "ground_truth": {"screenshot": "arxiv_author_1_gt.png", "description": "Author search opens", "visual_changes": ["Author search interface appears", "Search options visible"], "success_criteria": ["Search interface visible", "Author field active"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_share_1", "task": "Click the share button", "web": "https://www.bbc.com/news", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "share-button"}, "ground_truth": {"screenshot": "bbc_share_1_gt.png", "description": "Share options appear", "visual_changes": ["Share overlay appears", "Share options visible"], "success_criteria": ["Overlay is visible", "Options are clickable"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_examples_1", "task": "Click the 'Examples' tab", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Examples"}, "ground_truth": {"screenshot": "cambridge_examples_1_gt.png", "description": "Examples section loads", "visual_changes": ["Page transitions to examples", "Usage examples visible"], "success_criteria": ["Examples page loads", "Examples are visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_nutrition_1", "task": "Click the nutrition info button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Nutrition"}, "ground_truth": {"screenshot": "allrecipes_nutrition_1_gt.png", "description": "Nutrition info appears", "visual_changes": ["Nutrition overlay appears", "Nutritional values visible"], "success_criteria": ["Overlay is visible", "Values are readable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_wishlist_1", "task": "Click the 'Add to List' button", "web": "https://www.amazon.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Add to List"}, "ground_truth": {"screenshot": "amazon_wishlist_1_gt.png", "description": "List options appear", "visual_changes": ["List overlay appears", "List options visible"], "success_criteria": ["Overlay is visible", "Lists are selectable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_iphone_1", "task": "Click the 'iPhone' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "iPhone"}, "ground_truth": {"screenshot": "apple_iphone_1_gt.png", "description": "iPhone page loads", "visual_changes": ["Page transitions to iPhone", "iPhone products visible"], "success_criteria": ["iPhone page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_date_1", "task": "Click the date range filter", "web": "https://arxiv.org/", "element_type": "select", "interaction": "click", "target_element": {"type": "name", "value": "date-range"}, "ground_truth": {"screenshot": "arxiv_date_1_gt.png", "description": "Date options appear", "visual_changes": ["Date dropdown appears", "Range options visible"], "success_criteria": ["Dropdown is visible", "Ranges are selectable"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_climate_1", "task": "Click the 'Climate' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Climate"}, "ground_truth": {"screenshot": "bbc_climate_1_gt.png", "description": "Climate section loads", "visual_changes": ["Page transitions to climate", "Climate news visible"], "success_criteria": ["Climate page loads", "Climate content visible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_rating_1", "task": "Click the rating filter", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Rating"}, "ground_truth": {"screenshot": "booking_rating_1_gt.png", "description": "Rating options appear", "visual_changes": ["Rating overlay appears", "Score options visible"], "success_criteria": ["Overlay is visible", "Ratings are selectable"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_browse_1", "task": "Click the 'Browse Dictionary' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Browse Dictionary"}, "ground_truth": {"screenshot": "cambridge_browse_1_gt.png", "description": "Browse page loads", "visual_changes": ["Page transitions to browse", "Word categories visible"], "success_criteria": ["Browse page loads", "Categories are visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_video_1", "task": "Click the recipe video play button", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "class", "value": "video-play"}, "ground_truth": {"screenshot": "allrecipes_video_1_gt.png", "description": "Video starts playing", "visual_changes": ["Video begins playback", "Player controls visible"], "success_criteria": ["Video is playing", "Controls are visible"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_seller_1", "task": "Click the 'Other Sellers' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Other Sellers"}, "ground_truth": {"screenshot": "amazon_seller_1_gt.png", "description": "Seller options appear", "visual_changes": ["Seller list appears", "Price options visible"], "success_criteria": ["Seller list visible", "Prices are shown"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_tv_1", "task": "Click the 'TV & Home' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "TV & Home"}, "ground_truth": {"screenshot": "apple_tv_1_gt.png", "description": "TV & Home page loads", "visual_changes": ["Page transitions to TV", "TV products visible"], "success_criteria": ["TV page loads", "Products are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_title_1", "task": "Click the search box and type 'machine learning'", "web": "https://arxiv.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "name", "value": "query"}, "input_text": "machine learning", "ground_truth": {"screenshot": "arxiv_title_1_gt.png", "description": "Search term entered", "visual_changes": ["Text appears in search box", "Suggestions may appear"], "success_criteria": ["Text matches exactly", "Input is visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_science_1", "task": "Click the 'Science' section link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Science"}, "ground_truth": {"screenshot": "bbc_science_1_gt.png", "description": "Science section loads", "visual_changes": ["Page transitions to science", "Science news visible"], "success_criteria": ["Science page loads", "Science content visible"]}, "target_html": ""}
-{"web_name": "Cambridge Dictionary", "id": "cambridge_word_1", "task": "Click the 'Word of the Day' link", "web": "https://dictionary.cambridge.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Word of the Day"}, "ground_truth": {"screenshot": "cambridge_word_1_gt.png", "description": "Word of the Day loads", "visual_changes": ["Page transitions to word", "Word details visible"], "success_criteria": ["Word page loads", "Definition visible"]}, "target_html": ""}
-{"web_name": "AllRecipes", "id": "allrecipes_time_1", "task": "Click the cooking time filter", "web": "https://www.allrecipes.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Cooking Time"}, "ground_truth": {"screenshot": "allrecipes_time_1_gt.png", "description": "Time options appear", "visual_changes": ["Filter overlay appears", "Time ranges visible"], "success_criteria": ["Overlay is visible", "Ranges are selectable"]}, "target_html": ""}
-{"web_name": "Amazon", "id": "amazon_gift_1", "task": "Click the 'Gift Cards' link", "web": "https://www.amazon.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Gift Cards"}, "ground_truth": {"screenshot": "amazon_gift_1_gt.png", "description": "Gift cards page loads", "visual_changes": ["Page transitions to gifts", "Gift card options visible"], "success_criteria": ["Gift page loads", "Options are visible"]}, "target_html": ""}
-{"web_name": "Booking", "id": "booking_popular_1", "task": "Click the 'Popular filters' button", "web": "https://www.booking.com/", "element_type": "button", "interaction": "click", "target_element": {"type": "text", "value": "Popular filters"}, "ground_truth": {"screenshot": "booking_popular_1_gt.png", "description": "Popular filters appear", "visual_changes": ["Filter overlay appears", "Popular options visible"], "success_criteria": ["Overlay is visible", "Filters are selectable"]}, "target_html": ""}
-{"web_name": "Apple", "id": "apple_music_1", "task": "Click the 'Music' link", "web": "https://www.apple.com/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Music"}, "ground_truth": {"screenshot": "apple_music_1_gt.png", "description": "Music page loads", "visual_changes": ["Page transitions to music", "Music services visible"], "success_criteria": ["Music page loads", "Services are visible"]}, "target_html": ""}
-{"web_name": "ArXiv", "id": "arxiv_stats_1", "task": "Click the 'Statistics' subject link", "web": "https://arxiv.org/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Statistics"}, "ground_truth": {"screenshot": "arxiv_stats_1_gt.png", "description": "Statistics papers load", "visual_changes": ["Page transitions to stats", "Statistics papers visible"], "success_criteria": ["Stats page loads", "Papers are visible"]}, "target_html": ""}
-{"web_name": "BBC News", "id": "bbc_local_1", "task": "Click the 'Local News' link", "web": "https://www.bbc.com/news", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Local News"}, "ground_truth": {"screenshot": "bbc_local_1_gt.png", "description": "Local news loads", "visual_changes": ["Page transitions to local", "Local news visible"], "success_criteria": ["Local page loads", "News is visible"]}, "target_html": ""}
+{"web_name": "Cambridge Dictionary", "id": "cambridge_lookup_1", "task": "Click the search box and type 'hello'", "web": "https://dictionary.cambridge.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchword"}, "input_text": "hello", "target_html": "<input autofocus=\"\" aria-label=\"Search\" spellcheck=\"false\" type=\"text\" name=\"q\" autocomplete=\"off\" autocapitalize=\"none\" aria-required=\"true\" aria-invalid=\"false\" class=\"ft fon pr pt0 hbr-20 lc1 lp-10 lpl-15 cdo-search-input user-valid valid\" id=\"searchword\" lang=\"en\" placeholder=\"Search English\" maxlength=\"100\" value=\"\">", "ground_truth": {"screenshot": "evaluation/ground_truth/task_1_gt.png", "description": "The word 'hello' has been entered in the search box", "visual_changes": ["Text 'hello' appears in search box", "Text cursor visible at end of input", "Search suggestions may appear"], "success_criteria": ["Input text matches 'hello' exactly", "Text is visible in search box", "Search box maintains focus"]}}
diff --git a/data/evaluation_output.jsonl b/data/evaluation_output.jsonl
new file mode 100644
index 0000000..de0aec7
--- /dev/null
+++ b/data/evaluation_output.jsonl
@@ -0,0 +1,5 @@
+{
+  "total_tasks": 80,
+  "successful_tasks": 0,
+  "evaluations": []
+}
\ No newline at end of file
diff --git a/data/ground_truth.jsonl b/data/ground_truth.jsonl
new file mode 100644
index 0000000..4ba99b2
--- /dev/null
+++ b/data/ground_truth.jsonl
@@ -0,0 +1 @@
+{"task_id": 1, "target_html": "<button class='primary-button'>Click me</button>", "screenshot": "evaluation/ground_truth/task_1_gt.png"}
diff --git a/evaluation/README.md b/evaluation/README.md
index 01cb9f2..b9001fa 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -8,89 +8,70 @@ The evaluation system combines two approaches:
 1. Visual Validation (60% of score): Using GPT-4V to analyze screenshots
 2. HTML Element Validation (40% of score): Comparing actual HTML elements
 
-## Usage
+## Directory Structure
 
+```
+evaluation/
+├── ground_truth/        # Ground truth screenshots
+│   └── task_1_gt.png   # Named consistently as task_{id}_gt.png
+├── auto_eval.py        # Main evaluation script
+├── image_match.py      # GPT-4V based image comparison
+└── fuzzy_match.py      # HTML element comparison
+```
+
+## Environment Setup
+
+1. Ensure you have the OpenAI API key in your `.env` file:
+```bash
+OPENAI_API_KEY=your_openai_api_key
+```
+
+## Running Evaluation
+
+Evaluation is typically run through the main benchmark script, from the repository root:
+```bash
+python run.py --tasks data/dom_tasks.jsonl --output results --evaluate
+```
+
+It can also be run separately:
 ```bash
 python auto_eval.py \
-    --tasks ../data/dom_tasks.jsonl \
-    --results ../results/run_001 \
-    --output ../results/run_001/evaluation.json \
-    --openai-key YOUR_API_KEY
+    --tasks-file data/tasks.jsonl \
+    --results-dir data/results.json \
+    --output-file data/evaluation.json
 ```
 
 ## Evaluation Process
 
-1. **Visual Validation (60%)**
-   - Compare before/after screenshots
-   - Verify visual changes match expected interaction
-   - Check element visibility and state changes
-   - Uses GPT-4V for intelligent visual comparison
+1. **Visual Validation (GPT-4V)**
+   - Compares the agent's post-interaction screenshot with the ground-truth screenshot
+   - Considers task-specific requirements
+   - Returns a score and detailed reasoning
 
-2. **HTML Element Validation (40%)**
-   - Compare model's selected HTML element with ground truth
-   - Structure score (40%): Tag hierarchy and relationships
-   - Attributes score (30%): Element properties and identifiers
-   - Content score (30%): Inner HTML and text content
+2. **HTML Element Validation**
+   - Compares the expected target HTML with the HTML element the agent actually interacted with
+   - Uses fuzzy matching for robustness
+   - Considers element attributes and structure
 
-3. **Success Criteria**
-   - Visual score ≥ 0.9 for visual validation
-   - HTML similarity score ≥ 0.9 for element validation
-   - Combined weighted score ≥ 0.9 for overall success
+The final score is a weighted average:
+- Visual Score: 60%
+- HTML Score: 40%
 
 ## Output Format
 
 ```json
 {
-    "total_tasks": 10,
-    "successful_tasks": 8,
-    "evaluations": [
-        {
-            "task_id": "task_001",
-            "visual_evaluation": {
-                "score": 0.95,
-                "details": "Detailed visual evaluation..."
-            },
-            "html_evaluation": {
-                "score": 0.92,
-                "structure_score": 0.95,
-                "attributes_score": 0.90,
-                "content_score": 0.89
-            },
-            "final_score": 0.94,
-            "success": true,
-            "timestamp": 1234567890
-        }
-    ]
+  "total_tasks": 10,
+  "successful_tasks": 8,
+  "evaluations": [
+    {
+      "task_id": 1,
+      "success": true,
+      "visual_score": 0.95,
+      "html_score": 0.90,
+      "final_score": 0.93,
+      "reasoning": "..."
+    }
+  ]
 }
 ```
-
-## Scoring Details
-
-### Visual Score (60%)
-- Element visibility and positioning
-- State changes (hover effects, expansions)
-- Content updates and transitions
-- Overall visual accuracy
-
-### HTML Score (40%)
-1. **Structure (40% of HTML score)**
-   - Correct tag name
-   - Parent-child relationships
-   - Sibling context
-
-2. **Attributes (30% of HTML score)**
-   - ID and class matching
-   - ARIA attributes
-   - Event handlers
-   - Custom data attributes
-
-3. **Content (30% of HTML score)**
-   - Inner HTML similarity
-   - Text content matching
-   - Nested element structure
-
-## Requirements
-
-- OpenAI API key with GPT-4V access
-- Python 3.8+
-- Required packages in `requirements.txt`
diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py
index 993d705..6a0255e 100644
--- a/evaluation/auto_eval.py
+++ b/evaluation/auto_eval.py
@@ -1,238 +1,68 @@
-import argparse
-import os
+import logging
 import json
-import time
-import base64
 from pathlib import Path
-from typing import List, Dict, Any
-import logging
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
+from typing import Dict, Any, List
 from openai import OpenAI
-from dotenv import load_dotenv
-from .image_match import compare_images
-from .fuzzy_match import fuzzy_match_html
-
-SYSTEM_PROMPT = """You are an expert web automation evaluator. Your task is to:
-1. Analyze the provided HTML source and accessibility tree
-2. Identify and extract the complete HTML element that matches the target description
-3. Score the visual interaction based on the provided before/after screenshots
-
-For HTML element selection:
-- Return the complete HTML element including its attributes and inner content
-- Consider the element's context and relationship with surrounding elements
-- Ensure the selected element uniquely matches the target description
-
-For visual evaluation:
-- Score how well the interaction matches the expected outcome
-- Consider element visibility, positioning, and state changes
-- Account for any dynamic content or loading states
-
-Provide your response in the following JSON format:
-{
-    "selected_html": "<complete html element>",
-    "visual_score": float,  # 0.0 to 1.0
-    "confidence": float,    # 0.0 to 1.0
-    "reasoning": "string"   # Brief explanation of your evaluation
-}"""
-
-def encode_image(image_path: str) -> str:
-    """Encode image as base64 string"""
-    with open(image_path, "rb") as f:
-        return base64.b64encode(f.read()).decode('utf-8')
 
-def get_element_html_context(driver: webdriver.Chrome, element) -> str:
-    """Get HTML context of an element"""
-    return driver.execute_script("return arguments[0].outerHTML;", element)
-
-def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]:
-    """Get accessibility tree of the current page"""
-    return driver.execute_script("return window.axe.getEntireContext();")
-
-def compare_html_elements(html1: str, html2: str) -> Dict[str, Any]:
-    """Compare two HTML elements for structural similarity"""
-    from bs4 import BeautifulSoup
-    import difflib
-    
-    if not html1 or not html2:
-        return {
-            "total_score": 0.0,
-            "structure_score": 0.0,
-            "attributes_score": 0.0,
-            "content_score": 0.0
-        }
-    
-    # Parse HTML
-    soup1 = BeautifulSoup(html1, 'html.parser')
-    soup2 = BeautifulSoup(html2, 'html.parser')
-    
-    # Compare structure (tag names and hierarchy)
-    def get_structure(soup):
-        return [tag.name for tag in soup.find_all()]
-    structure1 = get_structure(soup1)
-    structure2 = get_structure(soup2)
-    structure_score = difflib.SequenceMatcher(None, structure1, structure2).ratio()
-    
-    # Compare attributes
-    def get_attributes(soup):
-        attrs = []
-        for tag in soup.find_all():
-            attrs.extend(sorted(tag.attrs.items()))
-        return attrs
-    attrs1 = get_attributes(soup1)
-    attrs2 = get_attributes(soup2)
-    attributes_score = difflib.SequenceMatcher(None, attrs1, attrs2).ratio()
-    
-    # Compare content
-    def get_content(soup):
-        return [text.strip() for text in soup.stripped_strings]
-    content1 = get_content(soup1)
-    content2 = get_content(soup2)
-    content_score = difflib.SequenceMatcher(None, content1, content2).ratio()
-    
-    # Calculate total score (weighted average)
-    total_score = (
-        0.4 * structure_score +    # Structure is most important
-        0.3 * attributes_score +   # Attributes are second
-        0.3 * content_score        # Content is third
-    )
-    
-    return {
-        "total_score": total_score,
-        "structure_score": structure_score,
-        "attributes_score": attributes_score,
-        "content_score": content_score
-    }
-
-def evaluate_task(
-    task: Dict[str, Any],
-    result: Dict[str, Any],
-    ground_truth: Dict[str, Any],
-    openai_client: OpenAI
-) -> Dict[str, Any]:
-    """Evaluate a task using GPT-4V for visual comparison and GPT-4 for HTML comparison"""
-    try:
-        # 1. Visual Evaluation using GPT-4V (50% of total score)
-        visual_correct, visual_reason = compare_images(
-            prompt=task['task'],
-            ground_truth_path=ground_truth['screenshot'],
-            agent_image_path=result['after_screenshot'],
-            note=ground_truth.get('description', '')
-        )
-        visual_score = 1.0 if visual_correct else 0.0
-        
-        # 2. HTML Evaluation using GPT-4 (50% of total score)
-        html_correct, html_reason = fuzzy_match_html(
-            task_description=task['task'],
-            actual_html=result.get('html_element', ''),
-            expected_html=ground_truth.get('target_html', ''),
-            note=ground_truth.get('description', '')
-        )
-        html_score = 1.0 if html_correct else 0.0
-        
-        # Calculate final score (50-50 split between visual and HTML)
-        final_score = (
-            0.5 * visual_score +    # Visual evaluation (50%)
-            0.5 * html_score        # HTML evaluation (50%)
-        )
-        
-        success = final_score >= 0.9  # Success threshold
-        
-        return {
-            "task_id": task["id"],
-            "success": success,
-            "visual_evaluation": {
-                "score": visual_score,
-                "details": visual_reason
-            },
-            "html_evaluation": {
-                "score": html_score,
-                "details": html_reason
-            },
-            "final_score": final_score,
-            "timestamp": time.time()
-        }
-        
-    except Exception as e:
-        logging.error(f"Error evaluating task {task['id']}: {str(e)}")
-        return {
-            "task_id": task["id"],
-            "success": False,
-            "error": str(e),
-            "timestamp": time.time()
-        }
+from evaluation.image_match import compare_images
+from evaluation.fuzzy_match import fuzzy_match_html
 
 def run_evaluation(
     tasks_file: Path,
     results_dir: Path,
-    ground_truth_dir: Path,
     output_file: Path,
     openai_key: str
-):
-    """Run evaluation on benchmark results"""
-    # Load tasks
-    with open(tasks_file) as f:
+) -> None:
+    """Run evaluation on task results using image and HTML comparison"""
+    # Initialize OpenAI client
+    client = OpenAI(api_key=openai_key)
+    
+    # Load tasks and results
+    with tasks_file.open() as f:
         tasks = [json.loads(line) for line in f if line.strip()]
     
-    # Load results
-    with open(results_dir / "results.json") as f:
+    with results_dir.open() as f:
         results = json.load(f)
+        if not isinstance(results, list):
+            results = [results]
     
-    # Initialize OpenAI client
-    client = OpenAI(api_key=openai_key)
-    
-    # Run evaluation for each task
     evaluations = []
     for task in tasks:
-        # Get result for this task
-        task_result = next(
-            (r for r in results if r["task_id"] == task["id"]),
-            None
-        )
-        if not task_result:
-            logging.warning(f"No result found for task {task['id']}")
-            continue
-        
-        # Get ground truth
-        ground_truth = {
-            "screenshot": str(ground_truth_dir / task["ground_truth"]["screenshot"]),
-            "description": task["ground_truth"].get("description", ""),
-            "target_html": task.get("target_html", "")
-        }
-        
-        # Evaluate task
-        evaluation = evaluate_task(task, task_result, ground_truth, client)
-        evaluations.append(evaluation)
+        task_id = task['id']
+        result = next((r for r in results if r.get('task_id') == task_id), None)
+        if result:
+            # Visual evaluation using compare_images
+            visual_correctness, visual_reasoning = compare_images(
+                prompt=f"Task: {task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
+                ground_truth_path=task['ground_truth']['screenshot'],
+                agent_image_path=result["after_screenshot"],
+                openai_client=client
+            )
+            
+            # HTML comparison using fuzzy_match
+            html_similarity = fuzzy_match_html(
+                task_description=f"{task['task']}\nInteraction: {task['interaction']}\nExpected: {task.get('expected_outcome', 'Complete the task as specified')}",
+                actual_html=result.get("html_element", ""),
+                expected_html=task.get('target_html', ''),
+                openai_client=client
+            )
+
+            # Combine scores and create evaluation
+            evaluation = {
+                "task_id": task.get("id"),
+                "success": result["success"],
+                "visual_score": visual_correctness,
+                "html_score": html_similarity,
+                "final_score": (visual_correctness + html_similarity) / 2,
+                "reasoning": visual_reasoning
+            }
+            evaluations.append(evaluation)
+            logging.info(f"Evaluated task {task_id}: score={evaluation.get('final_score', 0.0):.2f}")
     
-    # Save evaluations
-    with open(output_file, 'w') as f:
+    # Save evaluations to output file
+    with output_file.open('w') as f:
         json.dump({
             "total_tasks": len(tasks),
             "successful_tasks": sum(1 for e in evaluations if e.get("success", False)),
             "evaluations": evaluations
         }, f, indent=2)
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--tasks", type=Path, required=True)
-    parser.add_argument("--results", type=Path, required=True)
-    parser.add_argument("--ground-truth", type=Path, required=True)
-    parser.add_argument("--output", type=Path, required=True)
-    parser.add_argument("--openai-key", type=str, required=True)
-    args = parser.parse_args()
-    
-    run_evaluation(
-        args.tasks,
-        args.results,
-        args.ground_truth,
-        args.output,
-        args.openai_key
-    )
-
-if __name__ == "__main__":
-    main()
diff --git a/evaluation/fuzzy_match.py b/evaluation/fuzzy_match.py
index a84d190..60999c9 100644
--- a/evaluation/fuzzy_match.py
+++ b/evaluation/fuzzy_match.py
@@ -1,15 +1,39 @@
 import os
+import logging
 from openai import OpenAI
 from typing import Tuple
+from dotenv import load_dotenv
+
+load_dotenv()
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 
 def fuzzy_match_html(
     task_description: str,
     actual_html: str,
     expected_html: str,
     note: str = None,
+    openai_client: OpenAI = None
 ) -> Tuple[bool, str]:
     """Compare HTML elements using GPT-4 for semantic understanding"""
-    openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+    
+    if openai_client is None:
+        raise ValueError("OpenAI client must be provided")
+    
+    print("\n=== HTML Task Evaluation ===")
+    print(f"Task Description: {task_description}")
+    print("Agent's HTML Output:")
+    print(actual_html)
+    print("\nExpected HTML:")
+    print(expected_html)
+    if note:
+        print(f"\nAdditional Context: {note}")
+    
+    # Debug logging for API key handling
+    logger.debug("Using provided OpenAI client")
+    
+    client = openai_client
     
     user_prompt = f"""You are evaluating if an HTML element matches the expected element for the following task: {task_description}
 
@@ -44,14 +68,22 @@ def fuzzy_match_html(
         {"role": "user", "content": user_prompt},
     ]
     
-    response = openai.chat.completions.create(
+    response = client.chat.completions.create(
         model="gpt-4",
         messages=messages,
         max_tokens=400,
-        temperature=0.0,
+        temperature=0,
         stream=False
     )
     
+    print("\n=== Judge's HTML Evaluation ===")
+    print(f"Judge's Response: {response.choices[0].message.content}")
+    print("==============================\n")
+    
+    logger.debug("GPT-4 Response for HTML Comparison:")
+    logger.debug(f"Task: {task_description}")
+    logger.debug(f"Response: {response.choices[0].message.content}")
+    
     content = response.choices[0].message.content
     is_match = "true" in content.lower().strip()
     
diff --git a/evaluation/ground_truth/task_1_gt.png b/evaluation/ground_truth/task_1_gt.png
new file mode 100644
index 0000000..07fa140
Binary files /dev/null and b/evaluation/ground_truth/task_1_gt.png differ
diff --git a/evaluation/image_match.py b/evaluation/image_match.py
index 3ff0215..29b36fa 100644
--- a/evaluation/image_match.py
+++ b/evaluation/image_match.py
@@ -2,7 +2,10 @@
 import requests
 from openai import OpenAI
 import os
+import logging
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -20,47 +23,60 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
-def compare_images(prompt, ground_truth_path, agent_image_path, note = None):
-    # TODO: if the image is not there (the agent image path), then return False, "The agent did not generate an image"
-
+def compare_images(prompt, ground_truth_path, agent_image_path, note = None, openai_client = None):
+    if openai_client is None:
+        raise ValueError("OpenAI client must be provided")
+        
+    print("\n=== Visual Task Evaluation ===")
+    print(f"Task Description: {prompt}")
+    if note:
+        print(f"Additional Context: {note}")
+    print(f"Agent's Image: {agent_image_path}")
+    print(f"Ground Truth Image: {ground_truth_path}")
+    
     print (f"[DEBUG] Debugging the image output of this agent execution.")
     if not os.path.exists(agent_image_path):
         print (f"[DEBUG] The agent did not generate an image or generated the image with the wrong name or the wrong path.")
         return False, "The agent did not generate an image or generated the image with the wrong name or the wrong path."
     
-    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+    logger.debug("Using provided OpenAI client")
+    client = openai_client
 
     image1 = encode_image(ground_truth_path)
     image2 = encode_image(agent_image_path)
     user_prompt = f"The agent was trying to accomplish the following task: {prompt} The first image is the expected image and the second image is the agent's output. Does the image answer the question correctly as the expected image? Don't focus on unnecessary details, like axes titles or colors or image size or labels unless specified in the task."
     if note:
         user_prompt += f"Here are some notes to help you evaluate the images: {note}"
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image",
+                    "image_url": f"data:image/jpeg;base64,{image1}"
+                },
+                {
+                    "type": "image",
+                    "image_url": f"data:image/jpeg;base64,{image2}"
+                }
+            ]
+        }
+    ]
     response = client.chat.completions.create(
-        model="gpt-4-vision-preview",
-        temperature=0,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": user_prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{image1}"
-                        }
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{image2}"
-                        }
-                    }
-                ]
-            }
-        ],
-        max_tokens=300
+        model="gpt-4o",  # Updated model name for vision tasks
+        messages=messages,
+        max_tokens=500,
     )
+    
+    print("\n=== Judge's Visual Evaluation ===")
+    print(f"Judge's Response: {response.choices[0].message.content}")
+    print("===============================\n")
+    
+    logger.debug("GPT-4V Response for Image Comparison:")
+    logger.debug(f"Prompt: {prompt}")
+    logger.debug(f"Response: {response.choices[0].message.content}")
     print (f"[DEBUG] Response from the image comparison: {response.choices[0].message.content}")
     print (f"[DEBUG] Image Correctness: {response.choices[0].message.content.lower().strip() == 'true'}")
     return "true" in response.choices[0].message.content.lower().strip(), response.choices[0].message.content
diff --git a/run.py b/run.py
index 069ac36..749e0a0 100644
--- a/run.py
+++ b/run.py
@@ -1,18 +1,22 @@
 import argparse
 import json
-import logging
+import os
 import time
+import logging
 from pathlib import Path
-from typing import Dict, List, Any
-
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
+from utils import execute_interaction, save_screenshot, get_accessibility_tree, save_accessibility_tree, load_tasks_with_ground_truth, save_results
+from evaluation.auto_eval import run_evaluation
+from openai import OpenAI
+from dotenv import load_dotenv
 
-from utils import WebInteractionUtils, load_tasks, save_results, get_accessibility_tree, compute_image_similarity
+# Load environment variables at the start
+load_dotenv()
 
-def setup_logger(output_dir: Path) -> None:
+def setup_logging(output_dir: Path) -> None:
     """Setup logging configuration"""
     log_file = output_dir / "benchmark.log"
     logging.basicConfig(
@@ -24,142 +28,148 @@ def setup_logger(output_dir: Path) -> None:
         ]
     )
 
-def setup_driver(
-    headless: bool = True,
-    download_dir: str = None,
-    force_device_scale: bool = True
-) -> webdriver.Chrome:
-    """Setup Chrome WebDriver with specified options"""
-    options = Options()
-    
-    if force_device_scale:
-        options.add_argument("--force-device-scale-factor=1")
-    if headless:
-        options.add_argument("--headless")
-        options.add_argument(
-            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
-        )
-    if download_dir:
-        options.add_experimental_option(
-            "prefs", {"download.default_directory": download_dir}
-        )
-    
-    options.add_argument("--no-sandbox")
-    options.add_argument("--disable-dev-shm-usage")
-    
-    service = Service(ChromeDriverManager().install())
-    return webdriver.Chrome(service=service, options=options)
+def construct_interaction(task):
+    """Translate a task record into the action/selector/value dict passed to execute_interaction."""
+    target = task.get('target_element')
+    # The selector is encoded as "<type>=<value>" and stays empty when the task has no target element.
+    selector = f"{target['type']}={target['value']}" if target else ""
+    # "click" is the default action; "value" carries the text used by type actions.
+    return {"action": task.get("interaction", "click"), "selector": selector, "value": task.get("input_text", "")}
 
-def run_benchmark(
-    tasks_file: Path,
-    output_dir: Path,
-    headless: bool = True,
-    force_device_scale: bool = True,
-    save_accessibility_tree: bool = True,
-    image_match_threshold: float = 0.95
-) -> None:
-    """Run the DOM benchmark"""
-    
-    # Setup
+def main():
+    """Run every task from the tasks file in Chrome, save screenshots and results, then optionally evaluate."""
+    parser = argparse.ArgumentParser(description='Run web automation tasks')
+    parser.add_argument('--tasks', required=True, help='Path to tasks JSONL file')
+    parser.add_argument('--output', required=True, help='Output directory for results')
+    parser.add_argument('--save-accessibility-tree', action='store_true', help='Save accessibility tree')
+    parser.add_argument('--evaluate', action='store_true', help='Run evaluation after benchmark')
+    parser.add_argument('--wait-time', type=float, default=2.0, help='Wait time in seconds after page load and interactions')
+    args = parser.parse_args()
+
+    # Create output directory and setup logging
+    output_dir = Path(args.output)
     output_dir.mkdir(parents=True, exist_ok=True)
-    setup_logger(output_dir)
+    setup_logging(output_dir)
     
-    # Load tasks
-    tasks = load_tasks(tasks_file)
-    logging.info(f"Loaded {len(tasks)} tasks from {tasks_file}")
-    
-    # Setup WebDriver
-    driver = setup_driver(
-        headless=headless,
-        download_dir=str(output_dir / "downloads"),
-        force_device_scale=force_device_scale
+    # Setup Chrome with a visible window (headless mode is not enabled)
+    chrome_options = Options()
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-dev-shm-usage')
+    chrome_options.add_argument('--force-device-scale-factor=1')
+    chrome_options.add_argument('--window-size=1920,1080')  # Add window size
+    chrome_options.add_argument(
+        'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
     )
-    utils = WebInteractionUtils(driver)
+    
+    service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=service, options=chrome_options)
     
     try:
+        # Load tasks
+        tasks = load_tasks_with_ground_truth(args.tasks)
+        logging.info(f"Loaded {len(tasks)} tasks")
         results = []
-        for i, task in enumerate(tasks):
-            task_id = task["id"]
-            logging.info(f"Running task {i+1}/{len(tasks)}: {task_id}")
-            
-            # Load webpage
-            driver.get(task["web"])
-            time.sleep(2)  # Wait for page load
-            
-            # Get accessibility tree
-            if save_accessibility_tree:
-                tree_file = output_dir / f"accessibility_tree_{task_id}.json"
-                tree = get_accessibility_tree(driver, str(tree_file))
-                logging.info(f"Saved accessibility tree to {tree_file}")
-            
-            # Take before screenshot
-            before_screenshot = output_dir / f"before_{task_id}.png"
-            driver.save_screenshot(str(before_screenshot))
-            
-            # Execute interaction
-            success = utils.execute_interaction(task)
-            time.sleep(1)  # Wait for interaction effect
-            
-            # Take after screenshot
-            after_screenshot = output_dir / f"after_{task_id}.png"
-            driver.save_screenshot(str(after_screenshot))
-            
-            # Compare screenshots
-            image_similarity = compute_image_similarity(str(before_screenshot), str(after_screenshot))
+        
+        for i, task in enumerate(tasks, 1):
+            task_id = task.get('id', 'unknown')
+            logging.info(f"\nProcessing task {i}/{len(tasks)}: {task_id}")
+            logging.info(f"Task description: {task.get('task', 'No description')}")
             
-            # Save result
             result = {
-                "task_id": task_id,
-                "success": success,
-                "image_similarity": image_similarity,
-                "passed_threshold": image_similarity >= image_match_threshold,
-                "timestamp": time.time(),
-                "accessibility_tree": str(tree_file) if save_accessibility_tree else None
+                'task_id': task_id,
+                'success': False,
+                'error': None,
+                'task_description': task.get('task'),
+                'timestamp': time.time()
             }
-            results.append(result)
             
-            logging.info(
-                f"Task {task_id} completed: success={success}, "
-                f"image_similarity={image_similarity:.3f}"
-            )
+            try:
+                # Navigate to page
+                url = task.get('web')  # Use 'web' field from task
+                if not url:
+                    raise ValueError("No URL provided in task")
+                    
+                logging.info(f"Navigating to {url}")
+                driver.get(url)
+                time.sleep(args.wait_time)  # Wait for page load
+                
+                # Save before screenshot
+                before_screenshot = str(output_dir / f"{task_id}_before.png")
+                save_screenshot(driver, before_screenshot)
+                result['before_screenshot'] = before_screenshot
+                logging.info(f"Saved before screenshot: {before_screenshot}")
+                
+                # Save accessibility tree before interaction
+                if args.save_accessibility_tree:
+                    before_tree = get_accessibility_tree(driver)
+                    before_tree_path = str(output_dir / f"{task_id}_before_tree.json")
+                    save_accessibility_tree(before_tree, before_tree_path)
+                    result['before_tree'] = before_tree_path
+                    logging.info(f"Saved before accessibility tree: {before_tree_path}")
+                
+                # Construct and execute interaction
+                interaction = construct_interaction(task)
+                logging.info(f"Executing interaction: {interaction}")
+                success = execute_interaction(driver, interaction)
+                time.sleep(args.wait_time)  # Wait for interaction effects
+                
+                # Save after screenshot
+                after_screenshot = str(output_dir / f"{task_id}_after.png")
+                save_screenshot(driver, after_screenshot)
+                result['after_screenshot'] = after_screenshot
+                logging.info(f"Saved after screenshot: {after_screenshot}")
+                
+                # Save accessibility tree after interaction
+                if args.save_accessibility_tree:
+                    after_tree = get_accessibility_tree(driver)
+                    after_tree_path = str(output_dir / f"{task_id}_after_tree.json")
+                    save_accessibility_tree(after_tree, after_tree_path)
+                    result['after_tree'] = after_tree_path
+                    logging.info(f"Saved after accessibility tree: {after_tree_path}")
+                
+                result['success'] = success
+                logging.info(f"Task completed successfully: {success}")
+                
+            except Exception as e:
+                # One failing task is recorded in its result and must not abort the run
+                result['error'] = str(e)
+                logging.error(f"Error processing task {task_id}: {str(e)}", exc_info=True)
+                
+            results.append(result)
         
-        # Save results
+        # Save results to JSON file
         results_file = output_dir / "results.json"
         save_results(results, str(results_file))
         logging.info(f"Results saved to {results_file}")
         
+        # Run evaluation if requested; without an API key only the evaluation
+        # is skipped, so the summary below is still reported
+        if args.evaluate:
+            openai_key = os.getenv("OPENAI_API_KEY")
+            if not openai_key:
+                logging.error("No OpenAI API key found in environment")
+            else:
+                eval_output = output_dir / "evaluation.json"
+                run_evaluation(
+                    tasks_file=Path(args.tasks),
+                    results_dir=results_file,  # NOTE(review): a file path is passed as results_dir - confirm against run_evaluation
+                    output_file=eval_output,
+                    openai_key=openai_key
+                )
+                logging.info(f"Evaluation complete. Results saved to {eval_output}")
+        
         # Print summary
-        successful = sum(1 for r in results if r["success"])
-        passed_threshold = sum(1 for r in results if r["passed_threshold"])
+        successful = sum(1 for r in results if r['success'])
+        errors = sum(1 for r in results if r.get('error'))
         logging.info(
             f"\nBenchmark Summary:\n"
             f"Total Tasks: {len(tasks)}\n"
             f"Successful Interactions: {successful}\n"
-            f"Passed Image Threshold: {passed_threshold}\n"
+            f"Failed Tasks: {errors}\n"
+            f"Success Rate: {(successful / len(tasks) * 100 if tasks else 0.0):.1f}%"
         )
         
     finally:
         driver.quit()
 
-def main():
-    parser = argparse.ArgumentParser(description="Run DOM Benchmark")
-    parser.add_argument("--tasks", type=Path, required=True, help="Path to tasks JSONL file")
-    parser.add_argument("--output", type=Path, required=True, help="Output directory for results")
-    parser.add_argument("--headless", action="store_true", help="Run Chrome in headless mode")
-    parser.add_argument("--force-device-scale", action="store_true", help="Force device scale factor to 1")
-    parser.add_argument("--save-accessibility-tree", action="store_true", help="Save accessibility tree for each task")
-    parser.add_argument("--threshold", type=float, default=0.95, help="Image similarity threshold")
-    
-    args = parser.parse_args()
-    
-    run_benchmark(
-        tasks_file=args.tasks,
-        output_dir=args.output,
-        headless=args.headless,
-        force_device_scale=args.force_device_scale,
-        save_accessibility_tree=args.save_accessibility_tree,
-        image_match_threshold=args.threshold
-    )
-
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
diff --git a/test_env.py b/test_env.py
new file mode 100644
index 0000000..c6e5dc3
--- /dev/null
+++ b/test_env.py
@@ -0,0 +1,48 @@
+from dotenv import load_dotenv, find_dotenv
+import os
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+def test_api_key():
+    """Print where OPENAI_API_KEY is found, showing secrets only in masked form."""
+    def mask(value):  # a short prefix plus the length identifies a key without exposing it
+        return f"{value[:4]}... ({len(value)} chars)" if value else "<not set>"
+
+    # Check if OPENAI_API_KEY is already in environment before loading .env
+    api_key_before = os.getenv("OPENAI_API_KEY")
+    print(f"API key before loading .env: {mask(api_key_before)}")
+
+    # Locate the .env file that find_dotenv() resolves, load it, then read the key again
+    env_path = find_dotenv()
+    print(f"\nFound .env file at: {env_path}")
+    load_dotenv(env_path)
+    api_key_after = os.getenv("OPENAI_API_KEY")
+    print(f"\nAPI key after loading .env: {mask(api_key_after)}")
+
+    # Check for .env in different directories
+    possible_locations = [
+        os.path.join(os.getcwd(), '.env'),
+        os.path.join(os.path.expanduser('~'), '.env'),
+        os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env')
+    ]
+
+    print("\nChecking possible .env locations:")
+    for loc in possible_locations:
+        if os.path.exists(loc):
+            print(f"Found .env at: {loc}")
+            with open(loc, 'r') as f:  # list variable names only; the values may be secrets
+                names = [ln.split('=', 1)[0].strip() for ln in f if '=' in ln and not ln.lstrip().startswith('#')]
+            print(f"Defines: {', '.join(names) or '(no variables)'}")
+        else:
+            print(f"No .env at: {loc}")
+
+    # Print (masked) every environment variable whose name contains OPENAI
+    print("\nAll OPENAI-related environment variables:")
+    for key, value in os.environ.items():
+        if 'OPENAI' in key:
+            print(f"{key}: {mask(value)}")
+
+if __name__ == "__main__":
+    test_api_key()
diff --git a/test_openai.py b/test_openai.py
new file mode 100644
index 0000000..7d39f09
--- /dev/null
+++ b/test_openai.py
@@ -0,0 +1,31 @@
+from dotenv import load_dotenv
+import os
+from openai import OpenAI
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+def test_openai_connection():
+    """Send one tiny chat completion to check that the configured OpenAI key works."""
+    load_dotenv()
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        logger.error("No API key found in environment")
+        return
+    logger.debug(f"Found API key starting with: {api_key[:7]}")
+
+    request = {
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Say hello"}],
+        "max_tokens": 10,
+    }
+    try:
+        response = OpenAI(api_key=api_key).chat.completions.create(**request)
+        logger.info("Successfully connected to OpenAI API")
+        logger.debug(f"Response: {response.choices[0].message.content}")
+    except Exception as e:
+        logger.error(f"Failed to connect to OpenAI API: {str(e)}")
+
+if __name__ == "__main__":
+    test_openai_connection()
diff --git a/utils.py b/utils.py
index 1bac142..d753c1f 100644
--- a/utils.py
+++ b/utils.py
@@ -1,115 +1,151 @@
+import os
+import json
+import time
+import logging
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Union
 from selenium import webdriver
 from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-import json
-import base64
-import time
-import logging
-from typing import Dict, List, Any, Optional
-from PIL import Image
-import numpy as np
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
 
-def get_accessibility_tree(driver: webdriver.Chrome, save_file: Optional[str] = None) -> Dict:
-    """Get accessibility tree of the current page"""
-    js_script = """
-        function getAccessibilityTree(node, tree = {}) {
-            tree.role = node.role || '';
-            tree.name = node.tagName || '';
-            tree.type = node.type || '';
-            tree.value = node.value || '';
-            tree.textContent = node.textContent ? node.textContent.trim() : '';
+def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -> bool:
+    """Execute a single click/type interaction on the webpage; log any failure and return False instead of raising."""
+    try:
+        action = interaction.get("action", "").lower()
+        selector = interaction.get("selector", "")
+        value = interaction.get("value", "")
+        
+        if not selector:
+            logging.warning("No selector provided for interaction")
+            return False
             
-            const rect = node.getBoundingClientRect();
-            tree.location = {
-                x: rect.x,
-                y: rect.y,
-                width: rect.width,
-                height: rect.height
-            };
+        # Parse selector in format "type=value"; split only once so the value may itself contain "="
+        selector_parts = selector.split('=', 1)
+        if len(selector_parts) != 2:
+            logging.error(f"Invalid selector format: {selector}")
+            return False
             
-            tree.children = [];
-            const children = node.children;
-            for (let i = 0; i < children.length; i++) {
-                tree.children.push(getAccessibilityTree(children[i]));
-            }
-            return tree;
+        selector_type, selector_value = selector_parts
+        
+        # Map selector type to Selenium By (the lookup below is case-insensitive)
+        selector_map = {
+            'id': By.ID,
+            'class': By.CLASS_NAME,
+            'css': By.CSS_SELECTOR,
+            'xpath': By.XPATH,
+            'name': By.NAME,
+            'tag': By.TAG_NAME
         }
-        return getAccessibilityTree(document.documentElement);
-    """
-    tree = driver.execute_script(js_script)
-    
-    if save_file:
-        with open(save_file, 'w') as f:
-            json.dump(tree, f, indent=2)
-    
-    return tree
-
-class WebInteractionUtils:
-    def __init__(self, driver: webdriver.Chrome):
-        self.driver = driver
-        self.wait = WebDriverWait(driver, 10)
         
-    def find_element(self, locator_type: str, locator: str) -> Optional[Any]:
-        """Find element with wait and retry logic"""
-        try:
-            element = self.wait.until(
-                EC.presence_of_element_located((getattr(By, locator_type.upper()), locator))
-            )
-            return element
-        except Exception as e:
-            logging.error(f"Failed to find element {locator_type}={locator}: {str(e)}")
-            return None
-    
-    def execute_interaction(self, task: Dict[str, Any]) -> bool:
-        """Execute web interaction based on task definition"""
-        try:
-            # Find element
-            element = self.find_element(
-                task["target_element"].get("type", "XPATH"),
-                task["target_element"].get("value")
-            )
-            if not element:
-                return False
-                
-            # Execute interaction
-            interaction = task["interaction"].lower()
-            if interaction == "click":
-                element.click()
-            elif interaction == "type":
-                element.clear()
-                element.send_keys(task.get("input_text", ""))
-            elif interaction == "hover":
-                ActionChains(self.driver).move_to_element(element).perform()
-            else:
-                logging.error(f"Unknown interaction type: {interaction}")
-                return False
-                
-            return True
-            
-        except Exception as e:
-            logging.error(f"Failed to execute interaction: {str(e)}")
+        by_type = selector_map.get(selector_type.lower())
+        if not by_type:
+            logging.error(f"Unsupported selector type: {selector_type}")
+            return False
+        
+        # Wait (up to 10s per condition) for the element to be present, then clickable
+        wait = WebDriverWait(driver, 10)
+        element = wait.until(EC.presence_of_element_located((by_type, selector_value)))
+        wait.until(EC.element_to_be_clickable((by_type, selector_value)))
+        
+        logging.info(f"Found element using {selector_type}={selector_value}")
+        
+        if action == "click":
+            element.click()
+            logging.info("Clicked element")
+        elif action == "type":
+            element.clear()
+            element.send_keys(value)
+            logging.info(f"Typed '{value}' into element")
+        else:
+            logging.warning(f"Unsupported action: {action}")
             return False
+            
+        time.sleep(1)  # Allow for any animations/state changes
+        return True
+        
+    except TimeoutException:
+        logging.error(f"Timeout waiting for element: {selector}")
+        return False
+    except NoSuchElementException:
+        logging.error(f"Element not found: {selector}")
+        return False
+    except Exception as e:
+        logging.error(f"Error executing interaction: {str(e)}", exc_info=True)
+        return False
 
-def compute_image_similarity(img1_path: str, img2_path: str) -> float:
-    """Compute similarity between two images"""
-    img1 = np.array(Image.open(img1_path))
-    img2 = np.array(Image.open(img2_path))
-    
-    # Ensure same size
-    if img1.shape != img2.shape:
-        img2 = np.array(Image.open(img2_path).resize((img1.shape[1], img1.shape[0])))
-    
-    # Compute MSE
-    mse = np.mean((img1 - img2) ** 2)
-    
-    # Convert to similarity score (0 to 1)
-    similarity = 1 / (1 + mse)
-    
-    # Convert numpy float to Python float
-    return float(similarity)
+def save_screenshot(driver: webdriver.Chrome, filepath: str) -> bool:
+    """Write a screenshot of the current page to filepath, creating parent directories first."""
+    try:
+        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+        driver.save_screenshot(filepath)  # NOTE(review): the driver's return value is not checked
+    except Exception as exc:
+        logging.error(f"Error saving screenshot: {str(exc)}")
+        return False
+    logging.info(f"Screenshot saved to {filepath}")
+    return True
+
+def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]:
+    """Inject axe-core if needed and return the page's url, title and axe root selector ({} on any failure)."""
+    try:
+        # execute_async_script finishes when its last argument (the callback) is
+        # invoked, so completion and load failure are both reported through it.
+        axe_script = """
+            var done = arguments[arguments.length - 1];
+            if (window.axe) {
+                done(true);
+                return;
+            }
+            var script = document.createElement('script');
+            script.src = 'https://cdnjs.cloudflare.com/ajax/libs/axe-core/4.7.0/axe.min.js';
+            script.type = 'text/javascript';
+            script.onload = function () {
+                axe.configure({
+                    allowedOrigins: ['<same_origin>'],
+                    rules: []
+                });
+                done(true);
+            };
+            // A blocked or unreachable CDN would otherwise hang until the script timeout
+            script.onerror = function () { done(false); };
+            document.getElementsByTagName('head')[0].appendChild(script);
+        """
+        if not driver.execute_async_script(axe_script):
+            raise RuntimeError("axe-core could not be loaded")
+        # NOTE(review): getSelector is applied to the root element only - confirm this is the intended "tree"
+        return driver.execute_script("""
+            return {
+                url: window.location.href,
+                title: document.title,
+                tree: axe.utils.getSelector(document.documentElement)
+            };
+        """)
+    except Exception as e:
+        logging.error(f"Error getting accessibility tree: {str(e)}")
+        return {}
+
+def save_accessibility_tree(tree: Dict[str, Any], filepath: str) -> bool:
+    """Write tree as indented JSON to filepath, creating parent directories; return whether it succeeded."""
+    try:
+        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+        with open(filepath, 'w') as handle:
+            json.dump(tree, handle, indent=2)
+    except Exception as exc:
+        logging.error(f"Error saving accessibility tree: {str(exc)}")
+        return False
+    logging.info(f"Accessibility tree saved to {filepath}")
+    return True
+
+def load_tasks_with_ground_truth(tasks_file: str) -> List[Dict[str, Any]]:
+    """Load tasks from a JSONL file, one JSON object per non-blank line.
+
+    Ground truth paths are expected to be part of each task record, so no
+    separate lookup happens here.
+    """
+    with open(tasks_file) as f:
+        # Whitespace-only lines are skipped rather than parsed
+        return [json.loads(line) for line in f if line.strip()]
 
 def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:
     """Load tasks from JSONL file"""
@@ -122,18 +158,5 @@ def load_tasks(tasks_file: str) -> List[Dict[str, Any]]:
 
 def save_results(results: List[Dict[str, Any]], output_file: str) -> None:
     """Save benchmark results to JSON file"""
-    # Convert any numpy types to Python types
-    serializable_results = []
-    for result in results:
-        serializable_result = {}
-        for key, value in result.items():
-            if isinstance(value, np.floating):
-                serializable_result[key] = float(value)
-            elif isinstance(value, np.integer):
-                serializable_result[key] = int(value)
-            else:
-                serializable_result[key] = value
-        serializable_results.append(serializable_result)
-    
     with open(output_file, 'w') as f:
-        json.dump(serializable_results, f, indent=2)
+        json.dump(results, f, indent=2)  # no conversion step: values must already be JSON-serializable
diff --git a/utils/screenshot_utils.py b/utils/screenshot_utils.py
index d81eb2a..d9e1e46 100644
--- a/utils/screenshot_utils.py
+++ b/utils/screenshot_utils.py
@@ -67,8 +67,8 @@ def capture_task_screenshots(task_data, ground_truth_dir):
     try:
         # Create screenshot paths
         task_id = task_data["id"]
-        before_path = Path(ground_truth_dir) / f"{task_id}_before.png"
-        after_path = Path(ground_truth_dir) / f"{task_id}_gt.png"
+        before_path = Path(ground_truth_dir) / f"task_{task_id}_before.png"
+        after_path = Path(ground_truth_dir) / f"task_{task_id}_gt.png"
         
         # Take before screenshot
         take_full_page_screenshot(driver, task_data["web"], str(before_path))
@@ -97,17 +97,20 @@ def capture_task_screenshots(task_data, ground_truth_dir):
     finally:
         driver.quit()
 
-if __name__ == "__main__":
-    # Example usage
-    from pathlib import Path
-    import json
+def main():
+    """Capture before/ground-truth screenshots for every task in data/dom_tasks.jsonl."""
+    ground_truth_dir = Path("evaluation/ground_truth")
+    ground_truth_dir.mkdir(parents=True, exist_ok=True)  # evaluation/ itself may not exist yet
     
     # Load tasks
     tasks_file = Path("data/dom_tasks.jsonl")
-    ground_truth_dir = Path("data/ground_truth")
-    ground_truth_dir.mkdir(exist_ok=True)
     
     with open(tasks_file) as f:
         for line in f:
             task = json.loads(line)
             capture_task_screenshots(task, ground_truth_dir)
+
+if __name__ == "__main__":
+    from pathlib import Path
+    import json
+    main()