diff --git a/analyze_results.py b/analyze_results.py
new file mode 100644
index 0000000..81f8472
--- /dev/null
+++ b/analyze_results.py
@@ -0,0 +1,17 @@
+import json
+from pathlib import Path
+
+# Read results file
+results_file = Path('results/benchmark_results.json/results.json')
+with open(results_file) as f:
+    results = json.load(f)
+
+# Calculate success percentage
+total_tasks = len(results)
+successful_tasks = sum(1 for result in results if result.get('success', False))
+success_percentage = (successful_tasks / total_tasks) * 100 if total_tasks > 0 else 0
+
+print(f"\nResults Analysis:")
+print(f"Total Tasks: {total_tasks}")
+print(f"Successful Tasks: {successful_tasks}")
+print(f"Success Rate: {success_percentage:.2f}%")
diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl
index b5c8599..2129883 100644
--- a/data/dom_tasks.jsonl
+++ b/data/dom_tasks.jsonl
@@ -2,94 +2,4 @@
 {"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the Cart button", "web": "https://www.amazon.com", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "target_html": "
Cart
", "ground_truth": {"screenshot": "evaluation/ground_truth/task_2_gt.png", "description": "The Cart button has been clicked"}} {"web_name": "Google Maps", "id": "maps_search_1", "task": "Type 'San Francisco' into the search box and press search", "web": "https://www.google.com/maps", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchboxinput"}, "input_text": "San Francisco", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_3_gt.png", "description": "San Francisco has been entered and search initiated"}} {"web_name": "YouTube", "id": "youtube_logo_1", "task": "Hover over the YouTube logo in the top left", "web": "https://www.youtube.com", "element_type": "link", "interaction": "hover", "target_element": {"type": "id", "value": "logo"}, "target_html": "
", "ground_truth": {"screenshot": "evaluation/ground_truth/task_4_gt.png", "description": "The YouTube logo is in hover state, showing a tooltip with 'YouTube Home'"}} -{"web_name": "React Documentation", "id": "react_search_1", "task": "Type 'hooks tutorial' in the search box", "web": "https://legacy.reactjs.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "hooks tutorial", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_54_gt.png", "description": "The phrase 'hooks tutorial' has been entered in the search box"}} - -{"web_name": "Vue.js Documentation", "id": "vue_search_1", "task": "Type 'component props' into the search box and press search", "web": "https://vuejs.org/guide", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "component props", "target_html": "Search", "ground_truth": {"screenshot": "evaluation/ground_truth/task_55_gt.png", "description": "The phrase 'component props' has been entered and search initiated"}} - -{"web_name": "Django Documentation", "id": "django_search_1", "task": "Type 'model fields' in the search box and click search", "web": "https://docs.djangoproject.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "model fields", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_56_gt.png", "description": "The phrase 'model fields' has been entered in the search box and search initiated"}} - -{"web_name": "Flask Documentation", "id": "flask_search_1", "task": "Type 'route decorators' into the search box and press search", "web": "https://flask.palletsprojects.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "route decorators", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_57_gt.png", "description": "The phrase 'route decorators' has been entered and search initiated"}} - -{"web_name": "MDN Web Docs", "id": "mdn_nav_1", "task": "Click the Guides link in the navigation", "web": "https://developer.mozilla.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-guides"}, "target_html": "Guides", "ground_truth": {"screenshot": "evaluation/ground_truth/task_58_gt.png", "description": "The Guides link has been clicked"}} - -{"web_name": "W3Schools", "id": "w3_tutorial_1", "task": "Click the HTML Tutorial link", "web": "https://www.w3schools.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "w3-bar-item"}, "target_html": "Tutorials", "ground_truth": {"screenshot": "evaluation/ground_truth/task_59_gt.png", "description": "The HTML Tutorial link has been clicked"}} - -{"web_name": "Python Documentation", "id": "python_nav_1", "task": "Click the Library Reference link", "web": "https://docs.python.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "reference-nav"}, "target_html": "Language reference", "ground_truth": {"screenshot": "evaluation/ground_truth/task_60_gt.png", "description": "The Library Reference link has been clicked"}} - -{"web_name": "Node.js Documentation", "id": "node_nav_1", "task": "Click the Docs link", "web": "https://nodejs.org/en", "element_type": "link", "interaction": "click", 
"target_element": {"type": "class", "value": "docs-link"}, "target_html": "Docs", "ground_truth": {"screenshot": "evaluation/ground_truth/task_61_gt.png", "description": "The docs link has been clicked"}} - -{"web_name": "MySQL Documentation", "id": "mysql_search_2", "task": "Type 'stored procedures' in the search box", "web": "https://dev.mysql.com/doc", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "stored procedures", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_62_gt.png", "description": "The phrase 'stored procedures' has been entered in the search box"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_nav_1", "task": "Click the Developers link", "web": "https://www.postgresql.org/docs", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "dev-link"}, "target_html": "
  • Developers
  • ", "ground_truth": {"screenshot": "evaluation/ground_truth/task_63_gt.png", "description": "The Developers link has been clicked"}} - -{"web_name": "MySQL Documentation", "id": "mysql_nav_1", "task": "Click the Reference Manual link", "web": "https://dev.mysql.com/doc", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "reference-link"}, "target_html": "
  • Reference Manual
  • ", "ground_truth": {"screenshot": "evaluation/ground_truth/task_64_gt.png", "description": "The Reference Manual link has been clicked"}} - -{"web_name": "PHP Documentation", "id": "php_nav_1", "task": "Click the Get Involved link", "web": "https://www.php.net/docs.php", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "get_involved"}, "target_html": "Get Involved", "ground_truth": {"screenshot": "evaluation/ground_truth/task_65_gt.png", "description": "The Get Involved link has been clicked"}} - -{"web_name": "React Get Started", "id": "react_nav_1", "task": "Click the Get Started link", "web": "https://legacy.reactjs.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-tutorial"}, "target_html": "Get Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_66_gt.png", "description": "The Get Started link has been clicked"}} - -{"web_name": "Django Community Link", "id": "django_nav_1", "task": "Click the Community link", "web": "https://docs.djangoproject.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "community-tutorial"}, "target_html": "Community", "ground_truth": {"screenshot": "evaluation/ground_truth/task_67_gt.png", "description": "The Community link has been clicked"}} - -{"web_name": "Flask Documentation", "id": "flask_search_2", "task": "Type 'database' into the search box and press search", "web": "https://flask.palletsprojects.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "database", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_68_gt.png", "description": "The word 'database' has been entered and search initiated"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_search_1", "task": "Type 'indexes' in the search box", "web": "https://www.postgresql.org/docs", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docSearch"}, "input_text": "indexes", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_69_gt.png", "description": "The word 'indexes' has been entered in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_search_3", "task": "Type 'triggers' into the search box and press search", "web": "https://dev.mysql.com/doc", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "triggers", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_70_gt.png", "description": "The word 'triggers' has been entered and search initiated"}} - -{"web_name": "Node.js Documentation", "id": "node_search_1", "task": "Type 'events' into the search box and press search", "web": "https://nodejs.org/docs", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "events", "target_html": "Start typing...", "ground_truth": {"screenshot": "evaluation/ground_truth/task_71_gt.png", "description": "The word 'events' has been entered and search initiated"}} - -{"web_name": "React Documentation", "id": "react_search_2", "task": "Type 'context api' in the search box", "web": "https://react.dev/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "context api", "target_html": "", "ground_truth": 
{"screenshot": "evaluation/ground_truth/task_72_gt.png", "description": "The phrase 'context api' has been entered in the search box"}} - -{"web_name": "Vue.js Documentation", "id": "vue_nav_1", "task": "Click the API Reference link", "web": "https://vuejs.org/guide", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-api"}, "target_html": "API", "ground_truth": {"screenshot": "evaluation/ground_truth/task_73_gt.png", "description": "The API Reference link has been clicked"}} - -{"web_name": "Django Documentation", "id": "django_search_2", "task": "Type 'forms' into the search box and press search", "web": "https://docs.djangoproject.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "forms", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_74_gt.png", "description": "The word 'forms' has been entered and search initiated"}} - -{"web_name": "Flask Documentation", "id": "flask_nav_1", "task": "Click the API Reference link", "web": "https://flask.palletsprojects.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-api"}, "target_html": "API Reference", "ground_truth": {"screenshot": "evaluation/ground_truth/task_75_gt.png", "description": "The API Reference link has been clicked"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_search_2", "task": "Type 'replication' in the search box", "web": "https://www.postgresql.org/docs", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docSearch"}, "input_text": "replication", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_76_gt.png", "description": "The word 'replication' has been entered in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_nav_2", "task": "Click the Release Notes link", "web": "https://dev.mysql.com/doc", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "release_notes"}, "target_html": "MySQL 8.4Release Notes", "ground_truth": {"screenshot": "evaluation/ground_truth/task_77_gt.png", "description": "The Release Notes link has been clicked"}} - -{"web_name": "PHP Documentation", "id": "php_search_1", "task": "Type 'regex' into the search box and press search", "web": "https://www.php.net/docs.php", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-text"}, "input_text": "regex", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_42_gt.png", "description": "The word 'regex' has been entered and search initiated"}} - -{"web_name": "Node.js Documentation", "id": "node_nav_2", "task": "Click the Getting Started link", "web": "https://nodejs.org/en", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-getting-started"}, "target_html": "Getting Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_78_gt.png", "description": "The Getting Started link has been clicked"}} - -{"web_name": "Vue.js Documentation", "id": "vue_search_2", "task": "Type 'router' into the search box and press search", "web": "https://vuejs.org/guide", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "router", "target_html": "Search", "ground_truth": {"screenshot": 
"evaluation/ground_truth/task_79_gt.png", "description": "The word 'router' has been entered and search initiated"}} - -{"web_name": "Redis Documentation", "id": "redis_type_1", "task": "Type 'caching' into the search box", "web": "https://redis.io/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docsearch"}, "input_text": "caching", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_80_gt.png", "description": "The word 'caching' has been typed in the search box"}} - -{"web_name": "Python Documentation", "id": "python_type_2", "task": "Type 'list comprehension' into the search box", "web": "https://docs.python.org/3/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "list comprehension", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_81_gt.png", "description": "The phrase 'list comprehension' has been typed in the search box"}} - -{"web_name": "CPP Reference", "id": "cpp_type_1", "task": "Type 'vector' into the search box", "web": "https://en.cppreference.com/w/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "vector", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_82_gt.png", "description": "The word 'vector' has been typed in the search box"}} - -{"web_name": "Java Documentation", "id": "java_type_1", "task": "Type 'collections' into the search box", "web": "https://docs.oracle.com/en/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "collections", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_83_gt.png", "description": "The word 'collections' has been typed in the search box"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_type_1", "task": "Type 'joins' into the search box", "web": "https://www.postgresql.org/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "q"}, "input_text": "joins", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_84_gt.png", "description": "The word 'joins' has been typed in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_type_1", "task": "Type 'indexes' into the search box", "web": "https://dev.mysql.com/doc/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "indexes", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_85_gt.png", "description": "The word 'indexes' has been typed in the search box"}} - -{"web_name": "Rust Documentation", "id": "rust_type_1", "task": "Type 'ownership' into the search box", "web": "https://doc.rust-lang.org/book/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "ownership", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_86_gt.png", "description": "The word 'ownership' has been typed in the search box"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_type_1", "task": "Type 'interface' into the search box", "web": "https://www.typescriptlang.org/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search"}, "input_text": "interface", "target_html": "", 
"ground_truth": {"screenshot": "evaluation/ground_truth/task_87_gt.png", "description": "The word 'interface' has been typed in the search box"}} - -{"web_name": "Scala Documentation", "id": "scala_type_1", "task": "Type 'traits' into the search box", "web": "https://docs.scala-lang.org", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "traits", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_88_gt.png", "description": "The word 'traits' has been typed in the search box"}} - -{"web_name": "Swift Blog", "id": "swift_click_1", "task": "Click the Blog link", "web": "https://www.swift.org/documentation/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Blog"}, "target_html": "Blog", "ground_truth": {"screenshot": "evaluation/ground_truth/task_89_gt.png", "description": "The Blog link has been clicked"}} - -{"web_name": "Python Documentation", "id": "python_click_1", "task": "Click the Tutorial link", "web": "https://docs.python.org/3/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Tutorial"}, "target_html": "Tutorial", "ground_truth": {"screenshot": "evaluation/ground_truth/task_92_gt.png", "description": "The Tutorial link has been clicked"}} - -{"web_name": "Go Documentation", "id": "go_click_1", "task": "Click the Getting Started link", "web": "https://golang.org/doc/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Getting Started"}, "target_html": "Getting Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_93_gt.png", "description": "The Getting Started link has been clicked"}} - -{"web_name": "Rust Documentation", "id": "rust_click_1", "task": "Click the Learn Rust link", "web": "https://www.rust-lang.org/learn", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Learn Rust"}, "target_html": "Learn Rust", "ground_truth": {"screenshot": "evaluation/ground_truth/task_94_gt.png", "description": "The Learn Rust link has been clicked"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_click_1", "task": "Click the Handbook link", "web": "https://www.typescriptlang.org/docs/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Handbook"}, "target_html": "Handbook", "ground_truth": {"screenshot": "evaluation/ground_truth/task_95_gt.png", "description": "The Handbook link has been clicked"}} - -{"web_name": "MDN Web Docs", "id": "mdn_submit_1", "task": "Type 'javascript' into the search box and press search", "web": "https://developer.mozilla.org/en-US/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "top-nav-search-input"}, "input_text": "javascript", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_96_gt.png", "description": "The word 'javascript' has been entered and search initiated"}} - -{"web_name": "Python Documentation", "id": "python_submit_1", "task": "Type 'dictionary' into the search box and press search", "web": "https://docs.python.org/3/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "dictionary", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_97_gt.png", "description": "The word 'dictionary' has been entered and search initiated"}} 
- -{"web_name": "Rust Documentation", "id": "rust_submit_1", "task": "Type 'traits' into the search box and press search", "web": "https://doc.rust-lang.org/book/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "traits", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_98_gt.png", "description": "The word 'traits' has been entered and search initiated"}} - -{"web_name": "Go Documentation", "id": "go_submit_1", "task": "Type 'channels' into the search box and press search", "web": "https://golang.org/doc/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "channels", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_99_gt.png", "description": "The word 'channels' has been entered and search initiated"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_submit_1", "task": "Type 'generics' into the search box and press search", "web": "https://www.typescriptlang.org/docs/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "generics", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_100_gt.png", "description": "The word 'generics' has been entered and search initiated"}} +{"web_name": "React Documentation", "id": "react_search_1", "task": "Type 'hooks tutorial' in the search box", "web": "https://legacy.reactjs.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "hooks tutorial", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_54_gt.png", "description": "The phrase 'hooks tutorial' has been entered in the search box"}} \ No newline at end of file diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py index e496002..ffe39d8 100644 --- a/evaluation/auto_eval.py +++ b/evaluation/auto_eval.py @@ -13,7 +13,7 @@ def run_serial_evaluation( results_dir: Path, output_file: Path, openai_key: str -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results serially""" # Initialize OpenAI client client = OpenAI(api_key=openai_key) @@ -59,7 +59,7 @@ def run_serial_evaluation( "success": result["success"], "visual_score": visual_score, "html_score": html_score, - "final_score": (visual_score + html_score) / 2, + "final_score": (0.8 * visual_score + 0.2 * html_score), "visual_reasoning": visual_reasoning, "html_reasoning": html_reasoning } @@ -76,13 +76,18 @@ def run_serial_evaluation( "error": str(e) }) - # Save evaluations to output file - with output_file.open('w') as f: - json.dump({ - "total_tasks": len(tasks), - "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), - "evaluations": evaluations - }, f, indent=2) + evaluation_results = { + "total_tasks": len(tasks), + "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), + "evaluations": evaluations + } + + # Save evaluations if output file is provided + if output_file: + with output_file.open('w') as f: + json.dump(evaluation_results, f, indent=2) + + return evaluation_results def run_evaluation( tasks_file: Path, @@ -90,9 +95,9 @@ def run_evaluation( output_file: Path, openai_key: str, max_workers: int = None -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results using either serial or parallel mode""" if max_workers: - 
run_parallel_evaluation(tasks_file, results_dir, output_file, openai_key, max_workers) + return run_parallel_evaluation(tasks_file, results_dir, output_file, openai_key, max_workers) else: - run_serial_evaluation(tasks_file, results_dir, output_file, openai_key) + return run_serial_evaluation(tasks_file, results_dir, output_file, openai_key) diff --git a/evaluation/fuzzy_match.py b/evaluation/fuzzy_match.py index 70107f5..a8758b0 100644 --- a/evaluation/fuzzy_match.py +++ b/evaluation/fuzzy_match.py @@ -35,6 +35,22 @@ def fuzzy_match_html( client = openai_client + # Truncate inputs if too long + max_html_length = 2000 # Characters per HTML string + max_task_length = 500 # Characters for task description + + if len(actual_html) > max_html_length: + actual_html = actual_html[:max_html_length] + "..." + logger.warning("Actual HTML was truncated due to length") + + if len(expected_html) > max_html_length: + expected_html = expected_html[:max_html_length] + "..." + logger.warning("Expected HTML was truncated due to length") + + if len(task_description) > max_task_length: + task_description = task_description[:max_task_length] + "..." + logger.warning("Task description was truncated due to length") + user_prompt = f"""You are evaluating if an HTML element matches the expected element for the following task: {task_description} Expected HTML: {expected_html} diff --git a/evaluation/image_match.py b/evaluation/image_match.py index 76e3858..923302d 100644 --- a/evaluation/image_match.py +++ b/evaluation/image_match.py @@ -21,7 +21,11 @@ def encode_image(image_path): with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') + image_data = image_file.read() + # Check file size (max 20MB) + if len(image_data) > 20 * 1024 * 1024: + raise ValueError(f"Image {image_path} is too large (>20MB)") + return base64.b64encode(image_data).decode('utf-8') def compare_images(prompt, ground_truth_path, agent_image_path, note = None, openai_client = None): if openai_client is None: @@ -42,17 +46,40 @@ def compare_images(prompt, ground_truth_path, agent_image_path, note = None, ope logger.debug("Using provided OpenAI client") client = openai_client - image1 = encode_image(ground_truth_path) - image2 = encode_image(agent_image_path) + try: + image1 = encode_image(ground_truth_path) + image2 = encode_image(agent_image_path) + except ValueError as e: + logger.error(f"Image encoding error: {str(e)}") + return False, f"Image processing error: {str(e)}" + + # Truncate prompt if too long + max_prompt_length = 500 + if len(prompt) > max_prompt_length: + prompt = prompt[:max_prompt_length] + "..." + user_prompt = f"The agent was trying to accomplish the following task: {prompt} The first image is the expected image and the second image is the agent's output. Does the image answer the question correctly as the expected image? Don't focus on unnecessary details, like axes titles or colors or image size or labels unless specified in the task." if note: + # Truncate note if too long + if len(note) > 200: + note = note[:200] + "..." user_prompt += f"Here are some notes to help you evaluate the images: {note}" messages = [ - {"role": "system", "content": system_prompt}, + { + "role": "system", + "content": """You are evaluating if a web automation task was completed successfully. Compare the screenshots and determine if the task's goal was achieved, focusing on the relevant UI changes that indicate success. 
+ +Return a JSON object with: +- correctness (boolean): Whether the task was completed successfully +- reason (string): Clear explanation of your evaluation""" + }, { "role": "user", "content": [ - {"type": "text", "text": user_prompt}, + { + "type": "text", + "text": user_prompt + }, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image1}"} diff --git a/evaluation/parallel_eval.py b/evaluation/parallel_eval.py index dd0617d..ca41e6e 100644 --- a/evaluation/parallel_eval.py +++ b/evaluation/parallel_eval.py @@ -4,6 +4,7 @@ from typing import Dict, Any, List, Tuple from openai import OpenAI from concurrent.futures import ThreadPoolExecutor, as_completed +import time from evaluation.image_match import compare_images from evaluation.fuzzy_match import fuzzy_match_html @@ -32,16 +33,19 @@ def evaluate_task(task: Dict[str, Any], result: Dict[str, Any], client: OpenAI) visual_score = 1.0 if visual_correctness else 0.0 html_score = 1.0 if html_correctness else 0.0 + # Calculate final score: 80% visual, 20% HTML + final_score = (0.8 * visual_score) + (0.2 * html_score) + evaluation = { "task_id": task_id, "success": result["success"], "visual_score": visual_score, "html_score": html_score, - "final_score": (visual_score + html_score) / 2, + "final_score": final_score, "visual_reasoning": visual_reasoning, "html_reasoning": html_reasoning } - logging.info(f"Evaluated task {task_id}: score={evaluation.get('final_score', 0.0):.2f}") + logging.info(f"Evaluated task {task_id}: score={final_score:.2f}") return evaluation except Exception as e: logging.error(f"Error evaluating task {task_id}: {str(e)}") @@ -60,7 +64,7 @@ def run_parallel_evaluation( output_file: Path, openai_key: str, max_workers: int = 4 -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results in parallel""" # Initialize OpenAI client client = OpenAI(api_key=openai_key) @@ -84,33 +88,50 @@ def run_parallel_evaluation( if result: task_pairs.append((task, result)) - # Run evaluations in parallel - with ThreadPoolExecutor(max_workers=max_workers) as executor: - future_to_task = { - executor.submit(evaluate_task, task, result, client): task_id - for task, result in task_pairs - } + # Process tasks in smaller batches to avoid rate limits + batch_size = min(max_workers, 3) # Process at most 3 tasks at a time + for i in range(0, len(task_pairs), batch_size): + batch = task_pairs[i:i + batch_size] + logging.info(f"Processing evaluation batch {i//batch_size + 1}/{(len(task_pairs) + batch_size - 1)//batch_size}") - for future in as_completed(future_to_task): - try: - evaluation = future.result() - evaluations.append(evaluation) - except Exception as e: - task_id = future_to_task[future] - logging.error(f"Error in evaluation future for task {task_id}: {str(e)}") - evaluations.append({ - "task_id": task_id, - "success": False, - "visual_score": 0.0, - "html_score": 0.0, - "final_score": 0.0, - "error": str(e) - }) + # Run evaluations in parallel for this batch + with ThreadPoolExecutor(max_workers=batch_size) as executor: + future_to_task = { + executor.submit(evaluate_task, task, result, client): task['id'] + for task, result in batch + } + + for future in as_completed(future_to_task): + try: + evaluation = future.result(timeout=60) # 60 second timeout per evaluation + evaluations.append(evaluation) + logging.info(f"Completed evaluation for task {future_to_task[future]}") + except Exception as e: + task_id = future_to_task[future] + error_msg = f"Error in evaluation future for task {task_id}: {str(e)}" + 
logging.error(error_msg) + evaluations.append({ + "task_id": task_id, + "success": False, + "visual_score": 0.0, + "html_score": 0.0, + "final_score": 0.0, + "error": error_msg + }) + + # Add a small delay between batches to avoid rate limits + if i + batch_size < len(task_pairs): + time.sleep(1) + + evaluation_results = { + "total_tasks": len(tasks), + "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), + "evaluations": evaluations + } - # Save evaluations to output file - with output_file.open('w') as f: - json.dump({ - "total_tasks": len(tasks), - "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), - "evaluations": evaluations - }, f, indent=2) + # Save evaluations if output file is provided + if output_file: + with output_file.open('w') as f: + json.dump(evaluation_results, f, indent=2) + + return evaluation_results diff --git a/models/base.py b/models/base.py index 0228efe..2adaf27 100644 --- a/models/base.py +++ b/models/base.py @@ -14,26 +14,24 @@ class WebInteraction: @dataclass class TaskResult: - """Represents the result of executing a task.""" + """Class to store task execution results""" task_id: str success: bool - before_screenshot: Optional[str] = None - after_screenshot: Optional[str] = None + error: Optional[str] = None html_element: Optional[str] = None + after_screenshot: Optional[str] = None accessibility_tree: Optional[Dict[str, Any]] = None - error: Optional[str] = None metadata: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for serialization.""" + """Convert to dictionary format""" return { "task_id": self.task_id, "success": self.success, - "before_screenshot": self.before_screenshot, - "after_screenshot": self.after_screenshot, + "error": self.error, "html_element": self.html_element, + "after_screenshot": self.after_screenshot, "accessibility_tree": self.accessibility_tree, - "error": self.error, "metadata": self.metadata } diff --git a/models/gemini.py b/models/gemini.py index 1cfb4ee..e924f3d 100644 --- a/models/gemini.py +++ b/models/gemini.py @@ -110,10 +110,7 @@ def parse_task(self, task: Dict[str, Any]) -> WebInteraction: selector_type=interaction_data.get('selector_type', task['target_element']['type']), selector_value=interaction_data.get('selector_value', task['target_element']['value']), input_text=interaction_data.get('input_text'), - description=task['task'], - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=task['task'] ) except Exception as e: print(f"Error parsing Gemini response: {str(e)}") @@ -167,10 +164,7 @@ def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteract selector_type=interaction_data['selector_type'], selector_value=interaction_data['selector_value'], input_text=interaction_data.get('input_text'), - description=f"Error recovery: {task['task']}", - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=f"Error recovery: {task['task']}" ) except Exception as e: print(f"Error in error handling: {str(e)}") diff --git a/models/gpt4.py b/models/gpt4.py index bdc881e..c0d108a 100644 --- a/models/gpt4.py +++ b/models/gpt4.py @@ -1,5 +1,8 @@ import json import time +import os +import base64 +import logging from typing import Dict, Any, Optional, Tuple from openai import OpenAI from 
.base import BaseModel, WebInteraction, TaskResult @@ -7,20 +10,24 @@ class GPT4Model(BaseModel): """GPT-4 model implementation for the DOM benchmark.""" - def __init__(self, api_key: str, model_config: Dict[str, Any] = None): - super().__init__("gpt-4", model_config or {}) - self.client = OpenAI(api_key=api_key) + def __init__(self, api_key: str = None): + """Initialize GPT4Model with OpenAI API key""" + self.api_key = api_key or os.getenv('OPENAI_API_KEY') + if not self.api_key: + raise ValueError("OpenAI API key not provided") + + self.client = OpenAI(api_key=self.api_key) self.max_retries = 10 - self.model = model_config.get("model", "gpt-4") - self.temperature = model_config.get("temperature", 0) - self.max_tokens = model_config.get("max_tokens", 1000) + self.model = "gpt-4" + self.temperature = 0 + self.max_tokens = 1000 # Enhanced system prompt with hover support self.system_prompt = """You are an AI assistant that helps users interact with web elements. Your task is to understand the user's intent and generate precise web element interactions. For each task, analyze: -1. The user's goal and required interaction (click, type, scroll, wait, hover) +1. The user's goal and required interaction (click, type, hover) 2. The target element's properties and accessibility 3. Any constraints or special conditions @@ -29,31 +36,14 @@ def __init__(self, api_key: str, model_config: Dict[str, Any] = None): 2. Consider element visibility and interactability 3. Handle dynamic content and loading states 4. Pay attention to timing and wait states -5. Validate success criteria for each interaction -6. For hover actions: - - Ensure element is visible and interactable - - Consider dynamic content (dropdowns, tooltips) - - Validate hover effects and state changes Generate interactions in this JSON format: { - "action": "click|type|scroll|wait|hover", - "selector_type": "css|xpath|id", + "action": "click|type|hover", + "selector_type": "css|xpath|id|class", "selector_value": "string", "input_text": "string", # For type actions - "wait_time": integer, # For wait actions in seconds - "scroll_direction": "up|down", # For scroll actions - "hover_duration": integer, # For hover actions in milliseconds - "validation": { - "expected_state": "visible|hidden|text_present|text_absent|hover_effect", - "validation_selector": "string", # Element to validate - "expected_text": "string", # For text validation - "hover_effects": { # For hover validation - "type": "tooltip|dropdown|style_change", - "target_selector": "string", # Element affected by hover - "expected_changes": ["color_change", "visibility", "content"] - } - } + "description": "string" # Optional description of the interaction }""" def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]: @@ -111,14 +101,11 @@ def parse_task(self, task: Dict[str, Any]) -> WebInteraction: interaction_data = json.loads(content) return WebInteraction( - action=interaction_data.get('action', task.get('interaction', 'click')), - selector_type=interaction_data.get('selector_type', task['target_element']['type']), + action=interaction_data.get('action', task.get('interaction', 'click')).lower(), + selector_type=interaction_data.get('selector_type', task['target_element']['type']).lower(), selector_value=interaction_data.get('selector_value', task['target_element']['value']), - input_text=interaction_data.get('input_text'), - description=task['task'], - wait_time=interaction_data.get('wait_time', 0), - 
hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + input_text=interaction_data.get('input_text', task.get('input_text')), + description=task.get('task') ) except Exception as e: print(f"Error parsing GPT-4 response: {str(e)}") @@ -169,10 +156,7 @@ def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteract selector_type=interaction_data['selector_type'], selector_value=interaction_data['selector_value'], input_text=interaction_data.get('input_text'), - description=f"Error recovery: {task['task']}", - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=f"Error recovery: {task['task']}" ) except Exception as e: print(f"Error in error handling: {str(e)}") @@ -218,3 +202,104 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: if failure_reason: print(f"Validation failed: {failure_reason}") return False + + def evaluate_image_similarity(self, actual_img: str, expected_img: str) -> Dict[str, Any]: + """ + Evaluate similarity between actual and expected screenshots + + Args: + actual_img: Path to actual screenshot + expected_img: Path to expected (ground truth) screenshot + + Returns: + Dict containing similarity score and explanation + """ + try: + # Load images + with open(actual_img, "rb") as actual, open(expected_img, "rb") as expected: + response = self.client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "system", + "content": "You are an expert at comparing web page screenshots to determine if the same interaction was performed." + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Compare these two screenshots and determine if they show the same web interaction was performed. Focus on the relevant UI changes, not minor visual differences." + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64.b64encode(actual.read()).decode()}"} + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64.b64encode(expected.read()).decode()}"} + } + ] + } + ], + max_tokens=300 + ) + + return { + "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, + "explanation": response.choices[0].message.content + } + + except Exception as e: + logging.error(f"Error evaluating image similarity: {str(e)}") + return { + "score": 0.0, + "explanation": f"Error evaluating images: {str(e)}" + } + + def evaluate_html_similarity(self, actual_html: str, expected_html: str) -> Dict[str, Any]: + """ + Evaluate similarity between actual and expected HTML + + Args: + actual_html: Actual HTML string + expected_html: Expected HTML string + + Returns: + Dict containing similarity score and explanation + """ + try: + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + { + "role": "system", + "content": "You are an expert at comparing HTML elements to determine if they refer to the same interactive element." + }, + { + "role": "user", + "content": f"""Compare these two HTML elements and determine if they refer to the same interactive element: + + Actual HTML: + {actual_html} + + Expected HTML: + {expected_html} + + Focus on key attributes like id, class, role, and text content. 
Ignore minor differences in formatting or dynamic attributes.""" + } + ], + max_tokens=300 + ) + + return { + "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, + "explanation": response.choices[0].message.content + } + + except Exception as e: + logging.error(f"Error evaluating HTML similarity: {str(e)}") + return { + "score": 0.0, + "explanation": f"Error comparing HTML: {str(e)}" + } diff --git a/parallel_runner.py b/parallel_runner.py index bd875e5..d122a6d 100644 --- a/parallel_runner.py +++ b/parallel_runner.py @@ -9,7 +9,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.service import Service as ChromeService from utils import ( execute_interaction, @@ -23,6 +23,7 @@ class TaskRunner: """Handles parallel execution of benchmark tasks""" def __init__(self, + model, max_workers: int = 4, output_dir: Path = None, save_accessibility_tree: bool = True, @@ -31,11 +32,13 @@ def __init__(self, Initialize TaskRunner Args: + model: Language model to use for task parsing max_workers: Maximum number of concurrent Chrome instances output_dir: Directory for results and screenshots save_accessibility_tree: Whether to save accessibility trees wait_time: Wait time between actions in seconds """ + self.model = model self.max_workers = max_workers self.output_dir = output_dir or Path("results") self.save_accessibility_tree = save_accessibility_tree @@ -57,20 +60,29 @@ def __init__(self, # Thread-safe queue for results self.results_queue = queue.Queue() - def setup_driver(self) -> webdriver.Chrome: + def setup_driver(self): """Create and configure Chrome WebDriver instance""" chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--force-device-scale-factor=1') chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('--disable-gpu') # Disable GPU hardware acceleration + chrome_options.add_argument('--start-maximized') # Start maximized + chrome_options.add_argument('--disable-extensions') # Disable extensions + chrome_options.add_argument('--disable-popup-blocking') # Disable popup blocking chrome_options.add_argument( - 'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.140 Safari/537.36' ) - service = Service(ChromeDriverManager().install()) - return webdriver.Chrome(service=service, options=chrome_options) + # Use Selenium Manager instead of ChromeDriverManager + service = Service() + driver = webdriver.Chrome(service=service, options=chrome_options) + + # Navigate to about:blank first to ensure a clean start + driver.get("about:blank") + return driver def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: """Execute a single benchmark task""" @@ -88,7 +100,7 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: driver = self.setup_driver() # Navigate to page - url = task.get('web') + url = task.get('web') # Changed from 'url' to 'web' to match task data if not url: raise ValueError("No URL provided in task") @@ -96,11 +108,6 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: driver.get(url) time.sleep(self.wait_time) - # Save before 
screenshot - before_screenshot = self.output_dir / f"{task_id}_before.png" - save_screenshot(driver, str(before_screenshot)) - result['before_screenshot'] = str(before_screenshot) - # Save accessibility tree before interaction if self.save_accessibility_tree: before_tree = get_accessibility_tree(driver) @@ -109,14 +116,21 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: result['before_tree'] = str(before_tree_path) # Execute interaction + web_interaction = self.model.parse_task(task) interaction = { - "action": task.get("interaction", "click"), - "selector": f"{task['target_element']['type']}={task['target_element']['value']}" if task.get('target_element') else "", - "value": task.get("input_text", "") + 'action': web_interaction.action, + 'target_element': { + 'type': web_interaction.selector_type, + 'value': web_interaction.selector_value + }, + 'input_text': web_interaction.input_text } + logging.info(f"Task {task_id}: Executing interaction: {interaction}") success, element_html = execute_interaction(driver, interaction) - result['success'] = success + if not success: + raise ValueError("Interaction failed") + result['success'] = True result['html_element'] = element_html time.sleep(self.wait_time) @@ -146,28 +160,48 @@ def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Run tasks in parallel using ThreadPoolExecutor""" results = [] - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - # Submit all tasks - future_to_task = { - executor.submit(self.execute_task, task): task - for task in tasks - } + # Process tasks in smaller batches to avoid overwhelming the system + batch_size = min(self.max_workers, 5) # Process at most 5 tasks at a time + for i in range(0, len(tasks), batch_size): + batch = tasks[i:i + batch_size] + logging.info(f"Processing task batch {i//batch_size + 1}/{(len(tasks) + batch_size - 1)//batch_size}") + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + # Submit batch of tasks + future_to_task = { + executor.submit(self.execute_task, task): task + for task in batch + } + + # Process completed tasks + for future in as_completed(future_to_task): + task = future_to_task[future] + task_id = task.get('id', 'unknown') + try: + result = future.result(timeout=120) # 2 minute timeout per task + results.append(result) + logging.info(f"Completed task {task_id}") + except Exception as e: + error_msg = f"Task {task_id} failed with error: {str(e)}" + logging.error(error_msg) + results.append({ + 'task_id': task_id, + 'success': False, + 'error': error_msg, + 'task_description': task.get('task'), + 'timestamp': time.time() + }) - # Process completed tasks - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - results.append(result) - logging.info(f"Completed task {task.get('id', 'unknown')}") - except Exception as e: - logging.error(f"Task failed: {str(e)}", exc_info=True) + # Add a small delay between batches + if i + batch_size < len(tasks): + time.sleep(1) return results def run_parallel_benchmark( tasks_file: str, output_dir: str, + model, max_workers: int = 4, save_accessibility_tree: bool = True, wait_time: float = 2.0 @@ -178,6 +212,7 @@ def run_parallel_benchmark( Args: tasks_file: Path to JSONL file containing tasks output_dir: Directory for results and screenshots + model: Language model to use for task parsing max_workers: Maximum number of concurrent Chrome instances save_accessibility_tree: Whether to save accessibility trees wait_time: 
Wait time between actions in seconds @@ -191,6 +226,7 @@ def run_parallel_benchmark( # Initialize runner runner = TaskRunner( + model=model, max_workers=max_workers, output_dir=Path(output_dir), save_accessibility_tree=save_accessibility_tree, diff --git a/run.py b/run.py index efc56b2..5a5e449 100644 --- a/run.py +++ b/run.py @@ -6,6 +6,7 @@ from evaluation.auto_eval import run_evaluation from models import GPT4Model, ClaudeModel, GeminiModel import os +from dotenv import load_dotenv def get_model(model_name): """Get the appropriate model based on command line argument.""" @@ -13,8 +14,8 @@ def get_model(model_name): models = { 'gpt4': lambda: GPT4Model(api_key=os.getenv("OPENAI_API_KEY")), - 'claude': lambda: ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")), - 'gemini': lambda: GeminiModel(api_key=os.getenv("GOOGLE_API_KEY")) + 'claude': lambda: ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY"), model_config={}), + 'gemini': lambda: GeminiModel(api_key=os.getenv("GOOGLE_API_KEY"), model_config={}) } if model_name not in models: @@ -49,23 +50,19 @@ def main(): if args.mode == 'parallel': results = run_parallel_benchmark( tasks_file=args.tasks, - output_dir=args.output, - model=model, + output_dir=str(output_dir), max_workers=args.max_workers, save_accessibility_tree=args.save_accessibility_tree, wait_time=args.wait_time, - evaluate=args.evaluate, - evaluate_mode=args.evaluate_mode + model=model ) else: results = run_serial_benchmark( tasks_file=args.tasks, - output_dir=args.output, - model=model, + output_dir=str(output_dir), save_accessibility_tree=args.save_accessibility_tree, wait_time=args.wait_time, - evaluate=args.evaluate, - evaluate_mode=args.evaluate_mode + model=model ) # Save results @@ -75,14 +72,48 @@ def main(): # Run evaluation if requested if args.evaluate: - eval_output = output_dir / "evaluation.json" - run_evaluation( + # Run evaluations + eval_results = run_evaluation( tasks_file=Path(args.tasks), results_dir=results_file, - output_file=eval_output, + output_file=None, # Don't save to separate file openai_key=os.getenv('OPENAI_API_KEY'), max_workers=args.max_workers if args.evaluate_mode == 'parallel' else None ) + + # Update results with evaluations + for result in results: + task_id = result['task_id'] + eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None) + if eval_result: + # Get evaluation scores and explanations, with defaults if missing + visual_score = eval_result.get('visual_score', 0.0) + html_score = eval_result.get('html_score', 0.0) + final_score = eval_result.get('final_score', 0.0) # Get final score from evaluation + visual_reasoning = eval_result.get('visual_reasoning', 'No visual evaluation available') + html_reasoning = eval_result.get('html_reasoning', 'No HTML evaluation available') + + # Add evaluation scores to result + result['final_score'] = final_score # Add final score at top level + result['llm_evaluations'] = { + 'image_similarity': { + 'score': visual_score, + 'explanation': visual_reasoning + }, + 'html_fuzzy_match': { + 'score': html_score, + 'explanation': html_reasoning + } + } + # Update success based on evaluation scores + # Only mark as success if both image and HTML evaluations pass + result['success'] = (visual_score > 0.5 and html_score > 0.5) + if not result['success'] and not result['error']: + result['error'] = "Failed evaluation checks" + + # Save updated results + with open(results_file, 'w') as f: + json.dump(results, f, indent=2) if __name__ == '__main__': main() diff --git 
a/serial_runner.py b/serial_runner.py index 0e2f4c2..c8bf31f 100644 --- a/serial_runner.py +++ b/serial_runner.py @@ -20,6 +20,7 @@ class SerialTaskRunner: """Handles serial execution of benchmark tasks""" def __init__(self, + model, output_dir: Path = None, save_accessibility_tree: bool = True, wait_time: float = 2.0): @@ -27,10 +28,12 @@ def __init__(self, Initialize SerialTaskRunner Args: + model: Language model to use for task parsing output_dir: Directory for results and screenshots save_accessibility_tree: Whether to save accessibility trees wait_time: Wait time between actions in seconds """ + self.model = model self.output_dir = output_dir or Path("results") self.save_accessibility_tree = save_accessibility_tree self.wait_time = wait_time @@ -49,92 +52,114 @@ def __init__(self, ] ) - def setup_driver(self) -> webdriver.Chrome: + def setup_driver(self): """Create and configure Chrome WebDriver instance""" chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--force-device-scale-factor=1') chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('--disable-gpu') # Disable GPU hardware acceleration + chrome_options.add_argument('--start-maximized') # Start maximized + chrome_options.add_argument('--disable-extensions') # Disable extensions + chrome_options.add_argument('--disable-popup-blocking') # Disable popup blocking chrome_options.add_argument( - 'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.140 Safari/537.36' ) - service = Service(ChromeDriverManager().install()) - return webdriver.Chrome(service=service, options=chrome_options) + # Use Selenium Manager instead of ChromeDriverManager + service = Service() + driver = webdriver.Chrome(service=service, options=chrome_options) + + # Navigate to about:blank first to ensure a clean start + driver.get("about:blank") + return driver def execute_task(self, task: Dict[str, Any], task_num: int, total_tasks: int) -> Dict[str, Any]: """Execute a single benchmark task""" task_id = task.get('id', 'unknown') - logging.info(f"\nProcessing task {task_num}/{total_tasks}: {task_id}") - logging.info(f"Task description: {task.get('task', 'No description')}") + logging.info(f"\n{'='*50}") + logging.info(f"Starting task {task_num}/{total_tasks}: {task_id}") + logging.info(f"Task details: {task}") result = { 'task_id': task_id, 'success': False, 'error': None, - 'task_description': task.get('task'), - 'timestamp': time.time() + 'after_screenshot': None, + 'llm_evaluations': { + 'image_similarity': None, + 'html_fuzzy_match': None + } } try: - # Navigate to page + driver = self.setup_driver() + logging.info(f"Browser initialized for task {task_id}") + + # Navigate to URL url = task.get('web') if not url: raise ValueError("No URL provided in task") - logging.info(f"Task {task_id}: Navigating to {url}") - self.driver.get(url) - time.sleep(self.wait_time) - - # Save before screenshot - before_screenshot = self.output_dir / f"{task_id}_before.png" - save_screenshot(self.driver, str(before_screenshot)) - result['before_screenshot'] = str(before_screenshot) - logging.info(f"Saved before screenshot: {before_screenshot}") - - # Save accessibility tree before interaction - if self.save_accessibility_tree: - before_tree = 
get_accessibility_tree(self.driver) - before_tree_path = self.output_dir / f"{task_id}_before_tree.json" - save_accessibility_tree(before_tree, str(before_tree_path)) - result['before_tree'] = str(before_tree_path) - logging.info(f"Saved before accessibility tree: {before_tree_path}") + logging.info(f"Navigating to URL: {url}") + driver.get(url) + time.sleep(self.wait_time) # Wait for page load # Execute interaction + web_interaction = self.model.parse_task(task) interaction = { - "action": task.get("interaction", "click"), - "selector": f"{task['target_element']['type']}={task['target_element']['value']}" if task.get('target_element') else "", - "value": task.get("input_text", "") + 'action': web_interaction.action, + 'target_element': { + 'type': web_interaction.selector_type, + 'value': web_interaction.selector_value + }, + 'input_text': web_interaction.input_text } - logging.info(f"Executing interaction: {interaction}") - success, element_html = execute_interaction(self.driver, interaction) - result['success'] = success + logging.info(f"Task {task_id}: Executing interaction: {interaction}") + success, element_html = execute_interaction(driver, interaction) + if not success: + raise ValueError("Interaction failed") result['html_element'] = element_html - time.sleep(self.wait_time) + time.sleep(self.wait_time) # Wait for interaction to complete - # Save after screenshot - after_screenshot = self.output_dir / f"{task_id}_after.png" - save_screenshot(self.driver, str(after_screenshot)) - result['after_screenshot'] = str(after_screenshot) - logging.info(f"Saved after screenshot: {after_screenshot}") + # Take after screenshot + after_screenshot = save_screenshot(driver, self.output_dir / f"{task_id}_after.png") + result['after_screenshot'] = after_screenshot - # Save accessibility tree after interaction if self.save_accessibility_tree: - after_tree = get_accessibility_tree(self.driver) - after_tree_path = self.output_dir / f"{task_id}_after_tree.json" - save_accessibility_tree(after_tree, str(after_tree_path)) - result['after_tree'] = str(after_tree_path) - logging.info(f"Saved after accessibility tree: {after_tree_path}") - - logging.info(f"Task completed successfully: {success}") + after_tree = get_accessibility_tree(driver) + save_accessibility_tree(after_tree, self.output_dir / f"{task_id}_after_tree.json") + logging.info("Saved after screenshots and accessibility tree") + # Only mark as success if we have all required data + if after_screenshot and element_html: + # We have the data but need to wait for evaluations to determine final success + # Set to False for now, will be updated after evaluations + result['success'] = False + logging.info(f"Task {task_id} completed data collection") + else: + result['success'] = False + result['error'] = "Missing required data (screenshots or HTML element)" + except Exception as e: - result['error'] = str(e) - logging.error(f"Error in task {task_id}: {str(e)}", exc_info=True) - + error_msg = f"Error in task {task_id}: {str(e)}" + logging.error(error_msg, exc_info=True) + result['error'] = error_msg + result['success'] = False + + finally: + try: + if 'driver' in locals(): + driver.quit() + logging.info(f"Browser closed for task {task_id}") + except Exception as e: + logging.error(f"Error closing browser: {str(e)}") + + logging.info(f"Task {task_id} result: {result}") + logging.info(f"{'='*50}\n") return result def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -168,6 +193,7 @@ def run_tasks(self, tasks: List[Dict[str, 
@@ -168,6 +193,7 @@ def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 def run_serial_benchmark(
     tasks_file: str,
     output_dir: str,
+    model,
     save_accessibility_tree: bool = True,
     wait_time: float = 2.0
 ) -> List[Dict[str, Any]]:
@@ -177,11 +203,9 @@ def run_serial_benchmark(
     Args:
         tasks_file: Path to JSONL file containing tasks
         output_dir: Directory for results and screenshots
+        model: Language model to use for task parsing
         save_accessibility_tree: Whether to save accessibility trees
         wait_time: Wait time between actions in seconds
-
-    Returns:
-        List of task results
     """
     # Load tasks
     tasks = load_tasks_with_ground_truth(tasks_file)
@@ -189,6 +213,7 @@
     # Initialize runner
     runner = SerialTaskRunner(
+        model=model,
         output_dir=Path(output_dir),
         save_accessibility_tree=save_accessibility_tree,
         wait_time=wait_time
     )
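
For reference, a hedged sketch of how the updated entry point might be driven end to end. The file paths are illustrative, and `EchoModel` is the stand-in sketched after the `execute_task` hunk above, not part of this diff.

```python
# Hypothetical invocation of the updated benchmark entry point.
from serial_runner import run_serial_benchmark

results = run_serial_benchmark(
    tasks_file="data/dom_tasks.jsonl",
    output_dir="results",
    model=EchoModel(),  # any object exposing parse_task(task) works here
    save_accessibility_tree=True,
    wait_time=2.0,
)
# Per this patch, 'success' is finalized later by the LLM evaluations,
# so at this point the results mainly carry screenshots and element HTML.
print(f"Collected {len(results)} task results")
```
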
diff --git a/utils.py b/utils.py
index 32780dd..40395df 100644
--- a/utils.py
+++ b/utils.py
@@ -15,25 +15,33 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
     """Execute a single interaction on the webpage and return success status and element HTML"""
     try:
         action = interaction.get("action", "").lower()
-        selector = interaction.get("selector", "")
-        value = interaction.get("value", "")
-        if not selector:
-            logging.warning("No selector provided for interaction")
-            return False, None
-
-        # Parse selector in format "type=value"
-        selector_parts = selector.split('=', 1)
-        if len(selector_parts) != 2:
-            logging.error(f"Invalid selector format: {selector}")
+        # Get selector info from either old or new format
+        selector_type = None
+        selector_value = None
+
+        # Try new format first (target_element)
+        target_element = interaction.get("target_element")
+        if target_element:
+            selector_type = target_element.get("type")
+            selector_value = target_element.get("value")
+
+        # Fall back to old format if needed
+        if not selector_type or not selector_value:
+            selector = interaction.get("selector")
+            if selector:
+                selector_parts = selector.split('=', 1)
+                if len(selector_parts) == 2:
+                    selector_type, selector_value = selector_parts
+
+        if not selector_type or not selector_value:
+            logging.warning("No valid selector found in interaction")
             return False, None
-        selector_type, selector_value = selector_parts
-
         # Map selector type to Selenium By
         selector_map = {
             'id': By.ID,
-            'class': By.CLASS_NAME,
+            'class': By.CSS_SELECTOR,
             'css': By.CSS_SELECTOR,
             'xpath': By.XPATH,
             'name': By.NAME,
@@ -44,15 +52,26 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
         if not by_type:
             logging.error(f"Unsupported selector type: {selector_type}")
             return False, None
-
+
+        # For class selectors, convert to CSS format
+        selector_value_to_use = selector_value
+        if selector_type.lower() == 'class':
+            # Handle space-separated class names by converting to CSS format
+            classes = selector_value.split()
+            selector_value_to_use = '.' + '.'.join(classes)
+            logging.info(f"Converted class selector '{selector_value}' to CSS selector '{selector_value_to_use}'")
+
         # Wait for element to be present and interactable
-        wait = WebDriverWait(driver, 10)
-        element = wait.until(EC.presence_of_element_located((by_type, selector_value)))
-        wait.until(EC.element_to_be_clickable((by_type, selector_value)))
+        wait = WebDriverWait(driver, 30)
+        element = wait.until(EC.presence_of_element_located((by_type, selector_value_to_use)))
+        wait.until(EC.element_to_be_clickable((by_type, selector_value_to_use)))
 
         # Get element's outer HTML
         element_html = element.get_attribute('outerHTML')
 
+        # Prioritize input_text over value
+        value = interaction.get("input_text", interaction.get("value", ""))
+
         # Execute the interaction
         if action == "click":
             element.click()
@@ -66,22 +85,24 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
             logging.error(f"Unsupported action: {action}")
             return False, element_html
 
+        logging.info(f"Successfully executed {action} on {selector_type}={selector_value} with value '{value}'")
         return True, element_html
 
     except Exception as e:
         logging.error(f"Error executing interaction: {str(e)}")
         return False, None
 
-def save_screenshot(driver: webdriver.Chrome, filepath: str) -> bool:
+def save_screenshot(driver: webdriver.Chrome, filepath: Union[str, Path]) -> Optional[str]:
     """Save screenshot of the current page state"""
     try:
-        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
-        driver.save_screenshot(filepath)
+        filepath = Path(filepath)
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        driver.save_screenshot(str(filepath))
         logging.info(f"Screenshot saved to {filepath}")
-        return True
+        return str(filepath)
     except Exception as e:
         logging.error(f"Error saving screenshot: {str(e)}")
-        return False
+        return None
 
 def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]:
     """Get accessibility tree of the current page"""
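
Finally, a usage sketch for the reworked `execute_interaction`, showing the new structured `target_element` format, the legacy `"type=value"` selector fallback, and the class-to-CSS conversion. The URL and element selectors below are placeholders, and the snippet needs a live Chrome session to actually run.

```python
# Illustrative only: the two interaction formats execute_interaction now accepts.
# "https://example.com" and "submit-btn" are placeholder values, not real tasks.
from selenium import webdriver
from utils import execute_interaction

driver = webdriver.Chrome()  # Selenium Manager resolves the driver binary
driver.get("https://example.com")

# New structured format (what SerialTaskRunner now builds from the model output)
new_style = {
    "action": "click",
    "target_element": {"type": "id", "value": "submit-btn"},
    "input_text": "",
}

# Legacy "type=value" string format, still honored as a fallback
old_style = {"action": "click", "selector": "id=submit-btn", "value": ""}

success, element_html = execute_interaction(driver, new_style)
print(success, element_html)

# A class selector with several space-separated classes, e.g. "btn btn-primary",
# is converted internally to the CSS selector ".btn.btn-primary" before lookup.
driver.quit()
```
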