diff --git a/analyze_results.py b/analyze_results.py
new file mode 100644
index 0000000..81f8472
--- /dev/null
+++ b/analyze_results.py
@@ -0,0 +1,17 @@
+import json
+from pathlib import Path
+
+# Read results file
+results_file = Path('results/benchmark_results.json/results.json')
+with open(results_file) as f:
+    results = json.load(f)
+
+# Calculate success percentage
+total_tasks = len(results)
+successful_tasks = sum(1 for result in results if result.get('success', False))
+success_percentage = (successful_tasks / total_tasks) * 100 if total_tasks > 0 else 0
+
+print(f"\nResults Analysis:")
+print(f"Total Tasks: {total_tasks}")
+print(f"Successful Tasks: {successful_tasks}")
+print(f"Success Rate: {success_percentage:.2f}%")
diff --git a/data/dom_tasks.jsonl b/data/dom_tasks.jsonl
index b5c8599..2129883 100644
--- a/data/dom_tasks.jsonl
+++ b/data/dom_tasks.jsonl
@@ -2,94 +2,4 @@
 {"web_name": "Amazon", "id": "amazon_cart_1", "task": "Click the Cart button", "web": "https://www.amazon.com", "element_type": "link", "interaction": "click", "target_element": {"type": "id", "value": "nav-cart"}, "target_html": "
Cart
", "ground_truth": {"screenshot": "evaluation/ground_truth/task_2_gt.png", "description": "The Cart button has been clicked"}} {"web_name": "Google Maps", "id": "maps_search_1", "task": "Type 'San Francisco' into the search box and press search", "web": "https://www.google.com/maps", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchboxinput"}, "input_text": "San Francisco", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_3_gt.png", "description": "San Francisco has been entered and search initiated"}} {"web_name": "YouTube", "id": "youtube_logo_1", "task": "Hover over the YouTube logo in the top left", "web": "https://www.youtube.com", "element_type": "link", "interaction": "hover", "target_element": {"type": "id", "value": "logo"}, "target_html": "
", "ground_truth": {"screenshot": "evaluation/ground_truth/task_4_gt.png", "description": "The YouTube logo is in hover state, showing a tooltip with 'YouTube Home'"}} -{"web_name": "React Documentation", "id": "react_search_1", "task": "Type 'hooks tutorial' in the search box", "web": "https://legacy.reactjs.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "hooks tutorial", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_54_gt.png", "description": "The phrase 'hooks tutorial' has been entered in the search box"}} - -{"web_name": "Vue.js Documentation", "id": "vue_search_1", "task": "Type 'component props' into the search box and press search", "web": "https://vuejs.org/guide", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "component props", "target_html": "Search", "ground_truth": {"screenshot": "evaluation/ground_truth/task_55_gt.png", "description": "The phrase 'component props' has been entered and search initiated"}} - -{"web_name": "Django Documentation", "id": "django_search_1", "task": "Type 'model fields' in the search box and click search", "web": "https://docs.djangoproject.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "model fields", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_56_gt.png", "description": "The phrase 'model fields' has been entered in the search box and search initiated"}} - -{"web_name": "Flask Documentation", "id": "flask_search_1", "task": "Type 'route decorators' into the search box and press search", "web": "https://flask.palletsprojects.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "route decorators", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_57_gt.png", "description": "The phrase 'route decorators' has been entered and search initiated"}} - -{"web_name": "MDN Web Docs", "id": "mdn_nav_1", "task": "Click the Guides link in the navigation", "web": "https://developer.mozilla.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-guides"}, "target_html": "Guides", "ground_truth": {"screenshot": "evaluation/ground_truth/task_58_gt.png", "description": "The Guides link has been clicked"}} - -{"web_name": "W3Schools", "id": "w3_tutorial_1", "task": "Click the HTML Tutorial link", "web": "https://www.w3schools.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "w3-bar-item"}, "target_html": "Tutorials", "ground_truth": {"screenshot": "evaluation/ground_truth/task_59_gt.png", "description": "The HTML Tutorial link has been clicked"}} - -{"web_name": "Python Documentation", "id": "python_nav_1", "task": "Click the Library Reference link", "web": "https://docs.python.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "reference-nav"}, "target_html": "Language reference", "ground_truth": {"screenshot": "evaluation/ground_truth/task_60_gt.png", "description": "The Library Reference link has been clicked"}} - -{"web_name": "Node.js Documentation", "id": "node_nav_1", "task": "Click the Docs link", "web": "https://nodejs.org/en", "element_type": "link", "interaction": "click", 
"target_element": {"type": "class", "value": "docs-link"}, "target_html": "Docs", "ground_truth": {"screenshot": "evaluation/ground_truth/task_61_gt.png", "description": "The docs link has been clicked"}} - -{"web_name": "MySQL Documentation", "id": "mysql_search_2", "task": "Type 'stored procedures' in the search box", "web": "https://dev.mysql.com/doc", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "stored procedures", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_62_gt.png", "description": "The phrase 'stored procedures' has been entered in the search box"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_nav_1", "task": "Click the Developers link", "web": "https://www.postgresql.org/docs", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "dev-link"}, "target_html": "
  • Developers
  • ", "ground_truth": {"screenshot": "evaluation/ground_truth/task_63_gt.png", "description": "The Developers link has been clicked"}} - -{"web_name": "MySQL Documentation", "id": "mysql_nav_1", "task": "Click the Reference Manual link", "web": "https://dev.mysql.com/doc", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "reference-link"}, "target_html": "
  • Reference Manual
  • ", "ground_truth": {"screenshot": "evaluation/ground_truth/task_64_gt.png", "description": "The Reference Manual link has been clicked"}} - -{"web_name": "PHP Documentation", "id": "php_nav_1", "task": "Click the Get Involved link", "web": "https://www.php.net/docs.php", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "get_involved"}, "target_html": "Get Involved", "ground_truth": {"screenshot": "evaluation/ground_truth/task_65_gt.png", "description": "The Get Involved link has been clicked"}} - -{"web_name": "React Get Started", "id": "react_nav_1", "task": "Click the Get Started link", "web": "https://legacy.reactjs.org", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-tutorial"}, "target_html": "Get Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_66_gt.png", "description": "The Get Started link has been clicked"}} - -{"web_name": "Django Community Link", "id": "django_nav_1", "task": "Click the Community link", "web": "https://docs.djangoproject.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "community-tutorial"}, "target_html": "Community", "ground_truth": {"screenshot": "evaluation/ground_truth/task_67_gt.png", "description": "The Community link has been clicked"}} - -{"web_name": "Flask Documentation", "id": "flask_search_2", "task": "Type 'database' into the search box and press search", "web": "https://flask.palletsprojects.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "database", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_68_gt.png", "description": "The word 'database' has been entered and search initiated"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_search_1", "task": "Type 'indexes' in the search box", "web": "https://www.postgresql.org/docs", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docSearch"}, "input_text": "indexes", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_69_gt.png", "description": "The word 'indexes' has been entered in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_search_3", "task": "Type 'triggers' into the search box and press search", "web": "https://dev.mysql.com/doc", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "triggers", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_70_gt.png", "description": "The word 'triggers' has been entered and search initiated"}} - -{"web_name": "Node.js Documentation", "id": "node_search_1", "task": "Type 'events' into the search box and press search", "web": "https://nodejs.org/docs", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "events", "target_html": "Start typing...", "ground_truth": {"screenshot": "evaluation/ground_truth/task_71_gt.png", "description": "The word 'events' has been entered and search initiated"}} - -{"web_name": "React Documentation", "id": "react_search_2", "task": "Type 'context api' in the search box", "web": "https://react.dev/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "context api", "target_html": "", "ground_truth": 
{"screenshot": "evaluation/ground_truth/task_72_gt.png", "description": "The phrase 'context api' has been entered in the search box"}} - -{"web_name": "Vue.js Documentation", "id": "vue_nav_1", "task": "Click the API Reference link", "web": "https://vuejs.org/guide", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-api"}, "target_html": "API", "ground_truth": {"screenshot": "evaluation/ground_truth/task_73_gt.png", "description": "The API Reference link has been clicked"}} - -{"web_name": "Django Documentation", "id": "django_search_2", "task": "Type 'forms' into the search box and press search", "web": "https://docs.djangoproject.com", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "forms", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_74_gt.png", "description": "The word 'forms' has been entered and search initiated"}} - -{"web_name": "Flask Documentation", "id": "flask_nav_1", "task": "Click the API Reference link", "web": "https://flask.palletsprojects.com", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-api"}, "target_html": "API Reference", "ground_truth": {"screenshot": "evaluation/ground_truth/task_75_gt.png", "description": "The API Reference link has been clicked"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_search_2", "task": "Type 'replication' in the search box", "web": "https://www.postgresql.org/docs", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docSearch"}, "input_text": "replication", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_76_gt.png", "description": "The word 'replication' has been entered in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_nav_2", "task": "Click the Release Notes link", "web": "https://dev.mysql.com/doc", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "release_notes"}, "target_html": "MySQL 8.4Release Notes", "ground_truth": {"screenshot": "evaluation/ground_truth/task_77_gt.png", "description": "The Release Notes link has been clicked"}} - -{"web_name": "PHP Documentation", "id": "php_search_1", "task": "Type 'regex' into the search box and press search", "web": "https://www.php.net/docs.php", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-text"}, "input_text": "regex", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_42_gt.png", "description": "The word 'regex' has been entered and search initiated"}} - -{"web_name": "Node.js Documentation", "id": "node_nav_2", "task": "Click the Getting Started link", "web": "https://nodejs.org/en", "element_type": "link", "interaction": "click", "target_element": {"type": "class", "value": "nav-getting-started"}, "target_html": "Getting Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_78_gt.png", "description": "The Getting Started link has been clicked"}} - -{"web_name": "Vue.js Documentation", "id": "vue_search_2", "task": "Type 'router' into the search box and press search", "web": "https://vuejs.org/guide", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "router", "target_html": "Search", "ground_truth": {"screenshot": 
"evaluation/ground_truth/task_79_gt.png", "description": "The word 'router' has been entered and search initiated"}} - -{"web_name": "Redis Documentation", "id": "redis_type_1", "task": "Type 'caching' into the search box", "web": "https://redis.io/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "docsearch"}, "input_text": "caching", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_80_gt.png", "description": "The word 'caching' has been typed in the search box"}} - -{"web_name": "Python Documentation", "id": "python_type_2", "task": "Type 'list comprehension' into the search box", "web": "https://docs.python.org/3/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "list comprehension", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_81_gt.png", "description": "The phrase 'list comprehension' has been typed in the search box"}} - -{"web_name": "CPP Reference", "id": "cpp_type_1", "task": "Type 'vector' into the search box", "web": "https://en.cppreference.com/w/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "searchbox"}, "input_text": "vector", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_82_gt.png", "description": "The word 'vector' has been typed in the search box"}} - -{"web_name": "Java Documentation", "id": "java_type_1", "task": "Type 'collections' into the search box", "web": "https://docs.oracle.com/en/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "collections", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_83_gt.png", "description": "The word 'collections' has been typed in the search box"}} - -{"web_name": "PostgreSQL Documentation", "id": "postgres_type_1", "task": "Type 'joins' into the search box", "web": "https://www.postgresql.org/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "q"}, "input_text": "joins", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_84_gt.png", "description": "The word 'joins' has been typed in the search box"}} - -{"web_name": "MySQL Documentation", "id": "mysql_type_1", "task": "Type 'indexes' into the search box", "web": "https://dev.mysql.com/doc/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "indexes", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_85_gt.png", "description": "The word 'indexes' has been typed in the search box"}} - -{"web_name": "Rust Documentation", "id": "rust_type_1", "task": "Type 'ownership' into the search box", "web": "https://doc.rust-lang.org/book/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "ownership", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_86_gt.png", "description": "The word 'ownership' has been typed in the search box"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_type_1", "task": "Type 'interface' into the search box", "web": "https://www.typescriptlang.org/docs/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search"}, "input_text": "interface", "target_html": "", 
"ground_truth": {"screenshot": "evaluation/ground_truth/task_87_gt.png", "description": "The word 'interface' has been typed in the search box"}} - -{"web_name": "Scala Documentation", "id": "scala_type_1", "task": "Type 'traits' into the search box", "web": "https://docs.scala-lang.org", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "search-input"}, "input_text": "traits", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_88_gt.png", "description": "The word 'traits' has been typed in the search box"}} - -{"web_name": "Swift Blog", "id": "swift_click_1", "task": "Click the Blog link", "web": "https://www.swift.org/documentation/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Blog"}, "target_html": "Blog", "ground_truth": {"screenshot": "evaluation/ground_truth/task_89_gt.png", "description": "The Blog link has been clicked"}} - -{"web_name": "Python Documentation", "id": "python_click_1", "task": "Click the Tutorial link", "web": "https://docs.python.org/3/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Tutorial"}, "target_html": "Tutorial", "ground_truth": {"screenshot": "evaluation/ground_truth/task_92_gt.png", "description": "The Tutorial link has been clicked"}} - -{"web_name": "Go Documentation", "id": "go_click_1", "task": "Click the Getting Started link", "web": "https://golang.org/doc/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Getting Started"}, "target_html": "Getting Started", "ground_truth": {"screenshot": "evaluation/ground_truth/task_93_gt.png", "description": "The Getting Started link has been clicked"}} - -{"web_name": "Rust Documentation", "id": "rust_click_1", "task": "Click the Learn Rust link", "web": "https://www.rust-lang.org/learn", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Learn Rust"}, "target_html": "Learn Rust", "ground_truth": {"screenshot": "evaluation/ground_truth/task_94_gt.png", "description": "The Learn Rust link has been clicked"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_click_1", "task": "Click the Handbook link", "web": "https://www.typescriptlang.org/docs/", "element_type": "link", "interaction": "click", "target_element": {"type": "text", "value": "Handbook"}, "target_html": "Handbook", "ground_truth": {"screenshot": "evaluation/ground_truth/task_95_gt.png", "description": "The Handbook link has been clicked"}} - -{"web_name": "MDN Web Docs", "id": "mdn_submit_1", "task": "Type 'javascript' into the search box and press search", "web": "https://developer.mozilla.org/en-US/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "top-nav-search-input"}, "input_text": "javascript", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_96_gt.png", "description": "The word 'javascript' has been entered and search initiated"}} - -{"web_name": "Python Documentation", "id": "python_submit_1", "task": "Type 'dictionary' into the search box and press search", "web": "https://docs.python.org/3/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "dictionary", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_97_gt.png", "description": "The word 'dictionary' has been entered and search initiated"}} 
- -{"web_name": "Rust Documentation", "id": "rust_submit_1", "task": "Type 'traits' into the search box and press search", "web": "https://doc.rust-lang.org/book/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "traits", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_98_gt.png", "description": "The word 'traits' has been entered and search initiated"}} - -{"web_name": "Go Documentation", "id": "go_submit_1", "task": "Type 'channels' into the search box and press search", "web": "https://golang.org/doc/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search-input"}, "input_text": "channels", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_99_gt.png", "description": "The word 'channels' has been entered and search initiated"}} - -{"web_name": "TypeScript Documentation", "id": "typescript_submit_1", "task": "Type 'generics' into the search box and press search", "web": "https://www.typescriptlang.org/docs/", "element_type": "input", "interaction": "type_submit", "target_element": {"type": "id", "value": "search"}, "input_text": "generics", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_100_gt.png", "description": "The word 'generics' has been entered and search initiated"}} +{"web_name": "React Documentation", "id": "react_search_1", "task": "Type 'hooks tutorial' in the search box", "web": "https://legacy.reactjs.org/", "element_type": "input", "interaction": "type", "target_element": {"type": "id", "value": "algolia-doc-search"}, "input_text": "hooks tutorial", "target_html": "", "ground_truth": {"screenshot": "evaluation/ground_truth/task_54_gt.png", "description": "The phrase 'hooks tutorial' has been entered in the search box"}} \ No newline at end of file diff --git a/evaluation/auto_eval.py b/evaluation/auto_eval.py index e496002..ffe39d8 100644 --- a/evaluation/auto_eval.py +++ b/evaluation/auto_eval.py @@ -13,7 +13,7 @@ def run_serial_evaluation( results_dir: Path, output_file: Path, openai_key: str -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results serially""" # Initialize OpenAI client client = OpenAI(api_key=openai_key) @@ -59,7 +59,7 @@ def run_serial_evaluation( "success": result["success"], "visual_score": visual_score, "html_score": html_score, - "final_score": (visual_score + html_score) / 2, + "final_score": (0.8 * visual_score + 0.2 * html_score), "visual_reasoning": visual_reasoning, "html_reasoning": html_reasoning } @@ -76,13 +76,18 @@ def run_serial_evaluation( "error": str(e) }) - # Save evaluations to output file - with output_file.open('w') as f: - json.dump({ - "total_tasks": len(tasks), - "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), - "evaluations": evaluations - }, f, indent=2) + evaluation_results = { + "total_tasks": len(tasks), + "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), + "evaluations": evaluations + } + + # Save evaluations if output file is provided + if output_file: + with output_file.open('w') as f: + json.dump(evaluation_results, f, indent=2) + + return evaluation_results def run_evaluation( tasks_file: Path, @@ -90,9 +95,9 @@ def run_evaluation( output_file: Path, openai_key: str, max_workers: int = None -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results using either serial or parallel mode""" if max_workers: - 
run_parallel_evaluation(tasks_file, results_dir, output_file, openai_key, max_workers) + return run_parallel_evaluation(tasks_file, results_dir, output_file, openai_key, max_workers) else: - run_serial_evaluation(tasks_file, results_dir, output_file, openai_key) + return run_serial_evaluation(tasks_file, results_dir, output_file, openai_key) diff --git a/evaluation/fuzzy_match.py b/evaluation/fuzzy_match.py index 70107f5..a8758b0 100644 --- a/evaluation/fuzzy_match.py +++ b/evaluation/fuzzy_match.py @@ -35,6 +35,22 @@ def fuzzy_match_html( client = openai_client + # Truncate inputs if too long + max_html_length = 2000 # Characters per HTML string + max_task_length = 500 # Characters for task description + + if len(actual_html) > max_html_length: + actual_html = actual_html[:max_html_length] + "..." + logger.warning("Actual HTML was truncated due to length") + + if len(expected_html) > max_html_length: + expected_html = expected_html[:max_html_length] + "..." + logger.warning("Expected HTML was truncated due to length") + + if len(task_description) > max_task_length: + task_description = task_description[:max_task_length] + "..." + logger.warning("Task description was truncated due to length") + user_prompt = f"""You are evaluating if an HTML element matches the expected element for the following task: {task_description} Expected HTML: {expected_html} diff --git a/evaluation/image_match.py b/evaluation/image_match.py index 76e3858..923302d 100644 --- a/evaluation/image_match.py +++ b/evaluation/image_match.py @@ -21,7 +21,11 @@ def encode_image(image_path): with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') + image_data = image_file.read() + # Check file size (max 20MB) + if len(image_data) > 20 * 1024 * 1024: + raise ValueError(f"Image {image_path} is too large (>20MB)") + return base64.b64encode(image_data).decode('utf-8') def compare_images(prompt, ground_truth_path, agent_image_path, note = None, openai_client = None): if openai_client is None: @@ -42,17 +46,40 @@ def compare_images(prompt, ground_truth_path, agent_image_path, note = None, ope logger.debug("Using provided OpenAI client") client = openai_client - image1 = encode_image(ground_truth_path) - image2 = encode_image(agent_image_path) + try: + image1 = encode_image(ground_truth_path) + image2 = encode_image(agent_image_path) + except ValueError as e: + logger.error(f"Image encoding error: {str(e)}") + return False, f"Image processing error: {str(e)}" + + # Truncate prompt if too long + max_prompt_length = 500 + if len(prompt) > max_prompt_length: + prompt = prompt[:max_prompt_length] + "..." + user_prompt = f"The agent was trying to accomplish the following task: {prompt} The first image is the expected image and the second image is the agent's output. Does the image answer the question correctly as the expected image? Don't focus on unnecessary details, like axes titles or colors or image size or labels unless specified in the task." if note: + # Truncate note if too long + if len(note) > 200: + note = note[:200] + "..." user_prompt += f"Here are some notes to help you evaluate the images: {note}" messages = [ - {"role": "system", "content": system_prompt}, + { + "role": "system", + "content": """You are evaluating if a web automation task was completed successfully. Compare the screenshots and determine if the task's goal was achieved, focusing on the relevant UI changes that indicate success. 
+ +Return a JSON object with: +- correctness (boolean): Whether the task was completed successfully +- reason (string): Clear explanation of your evaluation""" + }, { "role": "user", "content": [ - {"type": "text", "text": user_prompt}, + { + "type": "text", + "text": user_prompt + }, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image1}"} diff --git a/evaluation/parallel_eval.py b/evaluation/parallel_eval.py index dd0617d..ca41e6e 100644 --- a/evaluation/parallel_eval.py +++ b/evaluation/parallel_eval.py @@ -4,6 +4,7 @@ from typing import Dict, Any, List, Tuple from openai import OpenAI from concurrent.futures import ThreadPoolExecutor, as_completed +import time from evaluation.image_match import compare_images from evaluation.fuzzy_match import fuzzy_match_html @@ -32,16 +33,19 @@ def evaluate_task(task: Dict[str, Any], result: Dict[str, Any], client: OpenAI) visual_score = 1.0 if visual_correctness else 0.0 html_score = 1.0 if html_correctness else 0.0 + # Calculate final score: 80% visual, 20% HTML + final_score = (0.8 * visual_score) + (0.2 * html_score) + evaluation = { "task_id": task_id, "success": result["success"], "visual_score": visual_score, "html_score": html_score, - "final_score": (visual_score + html_score) / 2, + "final_score": final_score, "visual_reasoning": visual_reasoning, "html_reasoning": html_reasoning } - logging.info(f"Evaluated task {task_id}: score={evaluation.get('final_score', 0.0):.2f}") + logging.info(f"Evaluated task {task_id}: score={final_score:.2f}") return evaluation except Exception as e: logging.error(f"Error evaluating task {task_id}: {str(e)}") @@ -60,7 +64,7 @@ def run_parallel_evaluation( output_file: Path, openai_key: str, max_workers: int = 4 -) -> None: +) -> Dict[str, Any]: """Run evaluation on task results in parallel""" # Initialize OpenAI client client = OpenAI(api_key=openai_key) @@ -84,33 +88,50 @@ def run_parallel_evaluation( if result: task_pairs.append((task, result)) - # Run evaluations in parallel - with ThreadPoolExecutor(max_workers=max_workers) as executor: - future_to_task = { - executor.submit(evaluate_task, task, result, client): task_id - for task, result in task_pairs - } + # Process tasks in smaller batches to avoid rate limits + batch_size = min(max_workers, 3) # Process at most 3 tasks at a time + for i in range(0, len(task_pairs), batch_size): + batch = task_pairs[i:i + batch_size] + logging.info(f"Processing evaluation batch {i//batch_size + 1}/{(len(task_pairs) + batch_size - 1)//batch_size}") - for future in as_completed(future_to_task): - try: - evaluation = future.result() - evaluations.append(evaluation) - except Exception as e: - task_id = future_to_task[future] - logging.error(f"Error in evaluation future for task {task_id}: {str(e)}") - evaluations.append({ - "task_id": task_id, - "success": False, - "visual_score": 0.0, - "html_score": 0.0, - "final_score": 0.0, - "error": str(e) - }) + # Run evaluations in parallel for this batch + with ThreadPoolExecutor(max_workers=batch_size) as executor: + future_to_task = { + executor.submit(evaluate_task, task, result, client): task['id'] + for task, result in batch + } + + for future in as_completed(future_to_task): + try: + evaluation = future.result(timeout=60) # 60 second timeout per evaluation + evaluations.append(evaluation) + logging.info(f"Completed evaluation for task {future_to_task[future]}") + except Exception as e: + task_id = future_to_task[future] + error_msg = f"Error in evaluation future for task {task_id}: {str(e)}" + 
logging.error(error_msg) + evaluations.append({ + "task_id": task_id, + "success": False, + "visual_score": 0.0, + "html_score": 0.0, + "final_score": 0.0, + "error": error_msg + }) + + # Add a small delay between batches to avoid rate limits + if i + batch_size < len(task_pairs): + time.sleep(1) + + evaluation_results = { + "total_tasks": len(tasks), + "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), + "evaluations": evaluations + } - # Save evaluations to output file - with output_file.open('w') as f: - json.dump({ - "total_tasks": len(tasks), - "successful_tasks": sum(1 for e in evaluations if e.get("success", False)), - "evaluations": evaluations - }, f, indent=2) + # Save evaluations if output file is provided + if output_file: + with output_file.open('w') as f: + json.dump(evaluation_results, f, indent=2) + + return evaluation_results diff --git a/models/base.py b/models/base.py index 0228efe..2adaf27 100644 --- a/models/base.py +++ b/models/base.py @@ -14,26 +14,24 @@ class WebInteraction: @dataclass class TaskResult: - """Represents the result of executing a task.""" + """Class to store task execution results""" task_id: str success: bool - before_screenshot: Optional[str] = None - after_screenshot: Optional[str] = None + error: Optional[str] = None html_element: Optional[str] = None + after_screenshot: Optional[str] = None accessibility_tree: Optional[Dict[str, Any]] = None - error: Optional[str] = None metadata: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for serialization.""" + """Convert to dictionary format""" return { "task_id": self.task_id, "success": self.success, - "before_screenshot": self.before_screenshot, - "after_screenshot": self.after_screenshot, + "error": self.error, "html_element": self.html_element, + "after_screenshot": self.after_screenshot, "accessibility_tree": self.accessibility_tree, - "error": self.error, "metadata": self.metadata } diff --git a/models/gemini.py b/models/gemini.py index 1cfb4ee..e924f3d 100644 --- a/models/gemini.py +++ b/models/gemini.py @@ -110,10 +110,7 @@ def parse_task(self, task: Dict[str, Any]) -> WebInteraction: selector_type=interaction_data.get('selector_type', task['target_element']['type']), selector_value=interaction_data.get('selector_value', task['target_element']['value']), input_text=interaction_data.get('input_text'), - description=task['task'], - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=task['task'] ) except Exception as e: print(f"Error parsing Gemini response: {str(e)}") @@ -167,10 +164,7 @@ def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteract selector_type=interaction_data['selector_type'], selector_value=interaction_data['selector_value'], input_text=interaction_data.get('input_text'), - description=f"Error recovery: {task['task']}", - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=f"Error recovery: {task['task']}" ) except Exception as e: print(f"Error in error handling: {str(e)}") diff --git a/models/gpt4.py b/models/gpt4.py index bdc881e..c0d108a 100644 --- a/models/gpt4.py +++ b/models/gpt4.py @@ -1,5 +1,8 @@ import json import time +import os +import base64 +import logging from typing import Dict, Any, Optional, Tuple from openai import OpenAI from 
.base import BaseModel, WebInteraction, TaskResult @@ -7,20 +10,24 @@ class GPT4Model(BaseModel): """GPT-4 model implementation for the DOM benchmark.""" - def __init__(self, api_key: str, model_config: Dict[str, Any] = None): - super().__init__("gpt-4", model_config or {}) - self.client = OpenAI(api_key=api_key) + def __init__(self, api_key: str = None): + """Initialize GPT4Model with OpenAI API key""" + self.api_key = api_key or os.getenv('OPENAI_API_KEY') + if not self.api_key: + raise ValueError("OpenAI API key not provided") + + self.client = OpenAI(api_key=self.api_key) self.max_retries = 10 - self.model = model_config.get("model", "gpt-4") - self.temperature = model_config.get("temperature", 0) - self.max_tokens = model_config.get("max_tokens", 1000) + self.model = "gpt-4" + self.temperature = 0 + self.max_tokens = 1000 # Enhanced system prompt with hover support self.system_prompt = """You are an AI assistant that helps users interact with web elements. Your task is to understand the user's intent and generate precise web element interactions. For each task, analyze: -1. The user's goal and required interaction (click, type, scroll, wait, hover) +1. The user's goal and required interaction (click, type, hover) 2. The target element's properties and accessibility 3. Any constraints or special conditions @@ -29,31 +36,14 @@ def __init__(self, api_key: str, model_config: Dict[str, Any] = None): 2. Consider element visibility and interactability 3. Handle dynamic content and loading states 4. Pay attention to timing and wait states -5. Validate success criteria for each interaction -6. For hover actions: - - Ensure element is visible and interactable - - Consider dynamic content (dropdowns, tooltips) - - Validate hover effects and state changes Generate interactions in this JSON format: { - "action": "click|type|scroll|wait|hover", - "selector_type": "css|xpath|id", + "action": "click|type|hover", + "selector_type": "css|xpath|id|class", "selector_value": "string", "input_text": "string", # For type actions - "wait_time": integer, # For wait actions in seconds - "scroll_direction": "up|down", # For scroll actions - "hover_duration": integer, # For hover actions in milliseconds - "validation": { - "expected_state": "visible|hidden|text_present|text_absent|hover_effect", - "validation_selector": "string", # Element to validate - "expected_text": "string", # For text validation - "hover_effects": { # For hover validation - "type": "tooltip|dropdown|style_change", - "target_selector": "string", # Element affected by hover - "expected_changes": ["color_change", "visibility", "content"] - } - } + "description": "string" # Optional description of the interaction }""" def _call_api(self, messages: list, retry_count: int = 0) -> Tuple[Optional[dict], bool]: @@ -111,14 +101,11 @@ def parse_task(self, task: Dict[str, Any]) -> WebInteraction: interaction_data = json.loads(content) return WebInteraction( - action=interaction_data.get('action', task.get('interaction', 'click')), - selector_type=interaction_data.get('selector_type', task['target_element']['type']), + action=interaction_data.get('action', task.get('interaction', 'click')).lower(), + selector_type=interaction_data.get('selector_type', task['target_element']['type']).lower(), selector_value=interaction_data.get('selector_value', task['target_element']['value']), - input_text=interaction_data.get('input_text'), - description=task['task'], - wait_time=interaction_data.get('wait_time', 0), - 
hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + input_text=interaction_data.get('input_text', task.get('input_text')), + description=task.get('task') ) except Exception as e: print(f"Error parsing GPT-4 response: {str(e)}") @@ -169,10 +156,7 @@ def handle_error(self, task: Dict[str, Any], error: str) -> Optional[WebInteract selector_type=interaction_data['selector_type'], selector_value=interaction_data['selector_value'], input_text=interaction_data.get('input_text'), - description=f"Error recovery: {task['task']}", - wait_time=interaction_data.get('wait_time', 0), - hover_duration=interaction_data.get('hover_duration', 0), - validation=interaction_data.get('validation', {}) + description=f"Error recovery: {task['task']}" ) except Exception as e: print(f"Error in error handling: {str(e)}") @@ -218,3 +202,104 @@ def validate_result(self, task: Dict[str, Any], result: TaskResult) -> bool: if failure_reason: print(f"Validation failed: {failure_reason}") return False + + def evaluate_image_similarity(self, actual_img: str, expected_img: str) -> Dict[str, Any]: + """ + Evaluate similarity between actual and expected screenshots + + Args: + actual_img: Path to actual screenshot + expected_img: Path to expected (ground truth) screenshot + + Returns: + Dict containing similarity score and explanation + """ + try: + # Load images + with open(actual_img, "rb") as actual, open(expected_img, "rb") as expected: + response = self.client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "system", + "content": "You are an expert at comparing web page screenshots to determine if the same interaction was performed." + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Compare these two screenshots and determine if they show the same web interaction was performed. Focus on the relevant UI changes, not minor visual differences." + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64.b64encode(actual.read()).decode()}"} + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64.b64encode(expected.read()).decode()}"} + } + ] + } + ], + max_tokens=300 + ) + + return { + "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, + "explanation": response.choices[0].message.content + } + + except Exception as e: + logging.error(f"Error evaluating image similarity: {str(e)}") + return { + "score": 0.0, + "explanation": f"Error evaluating images: {str(e)}" + } + + def evaluate_html_similarity(self, actual_html: str, expected_html: str) -> Dict[str, Any]: + """ + Evaluate similarity between actual and expected HTML + + Args: + actual_html: Actual HTML string + expected_html: Expected HTML string + + Returns: + Dict containing similarity score and explanation + """ + try: + response = self.client.chat.completions.create( + model="gpt-4", + messages=[ + { + "role": "system", + "content": "You are an expert at comparing HTML elements to determine if they refer to the same interactive element." + }, + { + "role": "user", + "content": f"""Compare these two HTML elements and determine if they refer to the same interactive element: + + Actual HTML: + {actual_html} + + Expected HTML: + {expected_html} + + Focus on key attributes like id, class, role, and text content. 
Ignore minor differences in formatting or dynamic attributes.""" + } + ], + max_tokens=300 + ) + + return { + "score": 1.0 if "same" in response.choices[0].message.content.lower() else 0.0, + "explanation": response.choices[0].message.content + } + + except Exception as e: + logging.error(f"Error evaluating HTML similarity: {str(e)}") + return { + "score": 0.0, + "explanation": f"Error comparing HTML: {str(e)}" + } diff --git a/parallel_runner.py b/parallel_runner.py index bd875e5..d122a6d 100644 --- a/parallel_runner.py +++ b/parallel_runner.py @@ -9,7 +9,7 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.chrome.service import Service as ChromeService from utils import ( execute_interaction, @@ -23,6 +23,7 @@ class TaskRunner: """Handles parallel execution of benchmark tasks""" def __init__(self, + model, max_workers: int = 4, output_dir: Path = None, save_accessibility_tree: bool = True, @@ -31,11 +32,13 @@ def __init__(self, Initialize TaskRunner Args: + model: Language model to use for task parsing max_workers: Maximum number of concurrent Chrome instances output_dir: Directory for results and screenshots save_accessibility_tree: Whether to save accessibility trees wait_time: Wait time between actions in seconds """ + self.model = model self.max_workers = max_workers self.output_dir = output_dir or Path("results") self.save_accessibility_tree = save_accessibility_tree @@ -57,20 +60,29 @@ def __init__(self, # Thread-safe queue for results self.results_queue = queue.Queue() - def setup_driver(self) -> webdriver.Chrome: + def setup_driver(self): """Create and configure Chrome WebDriver instance""" chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--force-device-scale-factor=1') chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('--disable-gpu') # Disable GPU hardware acceleration + chrome_options.add_argument('--start-maximized') # Start maximized + chrome_options.add_argument('--disable-extensions') # Disable extensions + chrome_options.add_argument('--disable-popup-blocking') # Disable popup blocking chrome_options.add_argument( - 'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.140 Safari/537.36' ) - service = Service(ChromeDriverManager().install()) - return webdriver.Chrome(service=service, options=chrome_options) + # Use Selenium Manager instead of ChromeDriverManager + service = Service() + driver = webdriver.Chrome(service=service, options=chrome_options) + + # Navigate to about:blank first to ensure a clean start + driver.get("about:blank") + return driver def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: """Execute a single benchmark task""" @@ -88,7 +100,7 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: driver = self.setup_driver() # Navigate to page - url = task.get('web') + url = task.get('web') # Changed from 'url' to 'web' to match task data if not url: raise ValueError("No URL provided in task") @@ -96,11 +108,6 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: driver.get(url) time.sleep(self.wait_time) - # Save before 
screenshot - before_screenshot = self.output_dir / f"{task_id}_before.png" - save_screenshot(driver, str(before_screenshot)) - result['before_screenshot'] = str(before_screenshot) - # Save accessibility tree before interaction if self.save_accessibility_tree: before_tree = get_accessibility_tree(driver) @@ -109,14 +116,21 @@ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]: result['before_tree'] = str(before_tree_path) # Execute interaction + web_interaction = self.model.parse_task(task) interaction = { - "action": task.get("interaction", "click"), - "selector": f"{task['target_element']['type']}={task['target_element']['value']}" if task.get('target_element') else "", - "value": task.get("input_text", "") + 'action': web_interaction.action, + 'target_element': { + 'type': web_interaction.selector_type, + 'value': web_interaction.selector_value + }, + 'input_text': web_interaction.input_text } + logging.info(f"Task {task_id}: Executing interaction: {interaction}") success, element_html = execute_interaction(driver, interaction) - result['success'] = success + if not success: + raise ValueError("Interaction failed") + result['success'] = True result['html_element'] = element_html time.sleep(self.wait_time) @@ -146,28 +160,48 @@ def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Run tasks in parallel using ThreadPoolExecutor""" results = [] - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - # Submit all tasks - future_to_task = { - executor.submit(self.execute_task, task): task - for task in tasks - } + # Process tasks in smaller batches to avoid overwhelming the system + batch_size = min(self.max_workers, 5) # Process at most 5 tasks at a time + for i in range(0, len(tasks), batch_size): + batch = tasks[i:i + batch_size] + logging.info(f"Processing task batch {i//batch_size + 1}/{(len(tasks) + batch_size - 1)//batch_size}") + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + # Submit batch of tasks + future_to_task = { + executor.submit(self.execute_task, task): task + for task in batch + } + + # Process completed tasks + for future in as_completed(future_to_task): + task = future_to_task[future] + task_id = task.get('id', 'unknown') + try: + result = future.result(timeout=120) # 2 minute timeout per task + results.append(result) + logging.info(f"Completed task {task_id}") + except Exception as e: + error_msg = f"Task {task_id} failed with error: {str(e)}" + logging.error(error_msg) + results.append({ + 'task_id': task_id, + 'success': False, + 'error': error_msg, + 'task_description': task.get('task'), + 'timestamp': time.time() + }) - # Process completed tasks - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - results.append(result) - logging.info(f"Completed task {task.get('id', 'unknown')}") - except Exception as e: - logging.error(f"Task failed: {str(e)}", exc_info=True) + # Add a small delay between batches + if i + batch_size < len(tasks): + time.sleep(1) return results def run_parallel_benchmark( tasks_file: str, output_dir: str, + model, max_workers: int = 4, save_accessibility_tree: bool = True, wait_time: float = 2.0 @@ -178,6 +212,7 @@ def run_parallel_benchmark( Args: tasks_file: Path to JSONL file containing tasks output_dir: Directory for results and screenshots + model: Language model to use for task parsing max_workers: Maximum number of concurrent Chrome instances save_accessibility_tree: Whether to save accessibility trees wait_time: 
Wait time between actions in seconds @@ -191,6 +226,7 @@ def run_parallel_benchmark( # Initialize runner runner = TaskRunner( + model=model, max_workers=max_workers, output_dir=Path(output_dir), save_accessibility_tree=save_accessibility_tree, diff --git a/run.py b/run.py index efc56b2..5a5e449 100644 --- a/run.py +++ b/run.py @@ -6,6 +6,7 @@ from evaluation.auto_eval import run_evaluation from models import GPT4Model, ClaudeModel, GeminiModel import os +from dotenv import load_dotenv def get_model(model_name): """Get the appropriate model based on command line argument.""" @@ -13,8 +14,8 @@ def get_model(model_name): models = { 'gpt4': lambda: GPT4Model(api_key=os.getenv("OPENAI_API_KEY")), - 'claude': lambda: ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY")), - 'gemini': lambda: GeminiModel(api_key=os.getenv("GOOGLE_API_KEY")) + 'claude': lambda: ClaudeModel(api_key=os.getenv("ANTHROPIC_API_KEY"), model_config={}), + 'gemini': lambda: GeminiModel(api_key=os.getenv("GOOGLE_API_KEY"), model_config={}) } if model_name not in models: @@ -49,23 +50,19 @@ def main(): if args.mode == 'parallel': results = run_parallel_benchmark( tasks_file=args.tasks, - output_dir=args.output, - model=model, + output_dir=str(output_dir), max_workers=args.max_workers, save_accessibility_tree=args.save_accessibility_tree, wait_time=args.wait_time, - evaluate=args.evaluate, - evaluate_mode=args.evaluate_mode + model=model ) else: results = run_serial_benchmark( tasks_file=args.tasks, - output_dir=args.output, - model=model, + output_dir=str(output_dir), save_accessibility_tree=args.save_accessibility_tree, wait_time=args.wait_time, - evaluate=args.evaluate, - evaluate_mode=args.evaluate_mode + model=model ) # Save results @@ -75,14 +72,48 @@ def main(): # Run evaluation if requested if args.evaluate: - eval_output = output_dir / "evaluation.json" - run_evaluation( + # Run evaluations + eval_results = run_evaluation( tasks_file=Path(args.tasks), results_dir=results_file, - output_file=eval_output, + output_file=None, # Don't save to separate file openai_key=os.getenv('OPENAI_API_KEY'), max_workers=args.max_workers if args.evaluate_mode == 'parallel' else None ) + + # Update results with evaluations + for result in results: + task_id = result['task_id'] + eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None) + if eval_result: + # Get evaluation scores and explanations, with defaults if missing + visual_score = eval_result.get('visual_score', 0.0) + html_score = eval_result.get('html_score', 0.0) + final_score = eval_result.get('final_score', 0.0) # Get final score from evaluation + visual_reasoning = eval_result.get('visual_reasoning', 'No visual evaluation available') + html_reasoning = eval_result.get('html_reasoning', 'No HTML evaluation available') + + # Add evaluation scores to result + result['final_score'] = final_score # Add final score at top level + result['llm_evaluations'] = { + 'image_similarity': { + 'score': visual_score, + 'explanation': visual_reasoning + }, + 'html_fuzzy_match': { + 'score': html_score, + 'explanation': html_reasoning + } + } + # Update success based on evaluation scores + # Only mark as success if both image and HTML evaluations pass + result['success'] = (visual_score > 0.5 and html_score > 0.5) + if not result['success'] and not result['error']: + result['error'] = "Failed evaluation checks" + + # Save updated results + with open(results_file, 'w') as f: + json.dump(results, f, indent=2) if __name__ == '__main__': main() diff --git 
a/serial_runner.py b/serial_runner.py index 0e2f4c2..c8bf31f 100644 --- a/serial_runner.py +++ b/serial_runner.py @@ -20,6 +20,7 @@ class SerialTaskRunner: """Handles serial execution of benchmark tasks""" def __init__(self, + model, output_dir: Path = None, save_accessibility_tree: bool = True, wait_time: float = 2.0): @@ -27,10 +28,12 @@ def __init__(self, Initialize SerialTaskRunner Args: + model: Language model to use for task parsing output_dir: Directory for results and screenshots save_accessibility_tree: Whether to save accessibility trees wait_time: Wait time between actions in seconds """ + self.model = model self.output_dir = output_dir or Path("results") self.save_accessibility_tree = save_accessibility_tree self.wait_time = wait_time @@ -49,92 +52,114 @@ def __init__(self, ] ) - def setup_driver(self) -> webdriver.Chrome: + def setup_driver(self): """Create and configure Chrome WebDriver instance""" chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--force-device-scale-factor=1') chrome_options.add_argument('--window-size=1920,1080') + chrome_options.add_argument('--disable-gpu') # Disable GPU hardware acceleration + chrome_options.add_argument('--start-maximized') # Start maximized + chrome_options.add_argument('--disable-extensions') # Disable extensions + chrome_options.add_argument('--disable-popup-blocking') # Disable popup blocking chrome_options.add_argument( - 'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.140 Safari/537.36' ) - service = Service(ChromeDriverManager().install()) - return webdriver.Chrome(service=service, options=chrome_options) + # Use Selenium Manager instead of ChromeDriverManager + service = Service() + driver = webdriver.Chrome(service=service, options=chrome_options) + + # Navigate to about:blank first to ensure a clean start + driver.get("about:blank") + return driver def execute_task(self, task: Dict[str, Any], task_num: int, total_tasks: int) -> Dict[str, Any]: """Execute a single benchmark task""" task_id = task.get('id', 'unknown') - logging.info(f"\nProcessing task {task_num}/{total_tasks}: {task_id}") - logging.info(f"Task description: {task.get('task', 'No description')}") + logging.info(f"\n{'='*50}") + logging.info(f"Starting task {task_num}/{total_tasks}: {task_id}") + logging.info(f"Task details: {task}") result = { 'task_id': task_id, 'success': False, 'error': None, - 'task_description': task.get('task'), - 'timestamp': time.time() + 'after_screenshot': None, + 'llm_evaluations': { + 'image_similarity': None, + 'html_fuzzy_match': None + } } try: - # Navigate to page + driver = self.setup_driver() + logging.info(f"Browser initialized for task {task_id}") + + # Navigate to URL url = task.get('web') if not url: raise ValueError("No URL provided in task") - logging.info(f"Task {task_id}: Navigating to {url}") - self.driver.get(url) - time.sleep(self.wait_time) - - # Save before screenshot - before_screenshot = self.output_dir / f"{task_id}_before.png" - save_screenshot(self.driver, str(before_screenshot)) - result['before_screenshot'] = str(before_screenshot) - logging.info(f"Saved before screenshot: {before_screenshot}") - - # Save accessibility tree before interaction - if self.save_accessibility_tree: - before_tree = 
get_accessibility_tree(self.driver) - before_tree_path = self.output_dir / f"{task_id}_before_tree.json" - save_accessibility_tree(before_tree, str(before_tree_path)) - result['before_tree'] = str(before_tree_path) - logging.info(f"Saved before accessibility tree: {before_tree_path}") + logging.info(f"Navigating to URL: {url}") + driver.get(url) + time.sleep(self.wait_time) # Wait for page load # Execute interaction + web_interaction = self.model.parse_task(task) interaction = { - "action": task.get("interaction", "click"), - "selector": f"{task['target_element']['type']}={task['target_element']['value']}" if task.get('target_element') else "", - "value": task.get("input_text", "") + 'action': web_interaction.action, + 'target_element': { + 'type': web_interaction.selector_type, + 'value': web_interaction.selector_value + }, + 'input_text': web_interaction.input_text } - logging.info(f"Executing interaction: {interaction}") - success, element_html = execute_interaction(self.driver, interaction) - result['success'] = success + logging.info(f"Task {task_id}: Executing interaction: {interaction}") + success, element_html = execute_interaction(driver, interaction) + if not success: + raise ValueError("Interaction failed") result['html_element'] = element_html - time.sleep(self.wait_time) + time.sleep(self.wait_time) # Wait for interaction to complete - # Save after screenshot - after_screenshot = self.output_dir / f"{task_id}_after.png" - save_screenshot(self.driver, str(after_screenshot)) - result['after_screenshot'] = str(after_screenshot) - logging.info(f"Saved after screenshot: {after_screenshot}") + # Take after screenshot + after_screenshot = save_screenshot(driver, self.output_dir / f"{task_id}_after.png") + result['after_screenshot'] = after_screenshot - # Save accessibility tree after interaction if self.save_accessibility_tree: - after_tree = get_accessibility_tree(self.driver) - after_tree_path = self.output_dir / f"{task_id}_after_tree.json" - save_accessibility_tree(after_tree, str(after_tree_path)) - result['after_tree'] = str(after_tree_path) - logging.info(f"Saved after accessibility tree: {after_tree_path}") - - logging.info(f"Task completed successfully: {success}") + after_tree = get_accessibility_tree(driver) + save_accessibility_tree(after_tree, self.output_dir / f"{task_id}_after_tree.json") + logging.info("Saved after screenshots and accessibility tree") + # Only mark as success if we have all required data + if after_screenshot and element_html: + # We have the data but need to wait for evaluations to determine final success + # Set to False for now, will be updated after evaluations + result['success'] = False + logging.info(f"Task {task_id} completed data collection") + else: + result['success'] = False + result['error'] = "Missing required data (screenshots or HTML element)" + except Exception as e: - result['error'] = str(e) - logging.error(f"Error in task {task_id}: {str(e)}", exc_info=True) - + error_msg = f"Error in task {task_id}: {str(e)}" + logging.error(error_msg, exc_info=True) + result['error'] = error_msg + result['success'] = False + + finally: + try: + if 'driver' in locals(): + driver.quit() + logging.info(f"Browser closed for task {task_id}") + except Exception as e: + logging.error(f"Error closing browser: {str(e)}") + + logging.info(f"Task {task_id} result: {result}") + logging.info(f"{'='*50}\n") return result def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -168,6 +193,7 @@ def run_tasks(self, tasks: List[Dict[str, 
@@ -168,6 +193,7 @@ def run_tasks(self, tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 def run_serial_benchmark(
     tasks_file: str,
     output_dir: str,
+    model,
     save_accessibility_tree: bool = True,
     wait_time: float = 2.0
 ) -> List[Dict[str, Any]]:
@@ -177,11 +203,9 @@ def run_serial_benchmark(
     Args:
         tasks_file: Path to JSONL file containing tasks
         output_dir: Directory for results and screenshots
+        model: Language model to use for task parsing
         save_accessibility_tree: Whether to save accessibility trees
         wait_time: Wait time between actions in seconds
-
-    Returns:
-        List of task results
     """
     # Load tasks
     tasks = load_tasks_with_ground_truth(tasks_file)
@@ -189,6 +213,7 @@
     # Initialize runner
     runner = SerialTaskRunner(
+        model=model,
         output_dir=Path(output_dir),
         save_accessibility_tree=save_accessibility_tree,
         wait_time=wait_time
     )
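
For reference, a hedged sketch of how the updated entry point might be driven end to end. The file paths are illustrative, and `EchoModel` is the stand-in sketched after the `execute_task` hunk above, not part of this diff.

```python
# Hypothetical invocation of the updated benchmark entry point.
from serial_runner import run_serial_benchmark

results = run_serial_benchmark(
    tasks_file="data/dom_tasks.jsonl",
    output_dir="results",
    model=EchoModel(),  # any object exposing parse_task(task) works here
    save_accessibility_tree=True,
    wait_time=2.0,
)
# Per this patch, 'success' is finalized later by the LLM evaluations,
# so at this point the results mainly carry screenshots and element HTML.
print(f"Collected {len(results)} task results")
```
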
diff --git a/utils.py b/utils.py
index 32780dd..40395df 100644
--- a/utils.py
+++ b/utils.py
@@ -15,25 +15,33 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
     """Execute a single interaction on the webpage and return success status and element HTML"""
     try:
         action = interaction.get("action", "").lower()
-        selector = interaction.get("selector", "")
-        value = interaction.get("value", "")
-        if not selector:
-            logging.warning("No selector provided for interaction")
-            return False, None
-
-        # Parse selector in format "type=value"
-        selector_parts = selector.split('=', 1)
-        if len(selector_parts) != 2:
-            logging.error(f"Invalid selector format: {selector}")
+        # Get selector info from either old or new format
+        selector_type = None
+        selector_value = None
+
+        # Try new format first (target_element)
+        target_element = interaction.get("target_element")
+        if target_element:
+            selector_type = target_element.get("type")
+            selector_value = target_element.get("value")
+
+        # Fall back to old format if needed
+        if not selector_type or not selector_value:
+            selector = interaction.get("selector")
+            if selector:
+                selector_parts = selector.split('=', 1)
+                if len(selector_parts) == 2:
+                    selector_type, selector_value = selector_parts
+
+        if not selector_type or not selector_value:
+            logging.warning("No valid selector found in interaction")
             return False, None
-        selector_type, selector_value = selector_parts
-
         # Map selector type to Selenium By
         selector_map = {
             'id': By.ID,
-            'class': By.CLASS_NAME,
+            'class': By.CSS_SELECTOR,
             'css': By.CSS_SELECTOR,
             'xpath': By.XPATH,
             'name': By.NAME,
@@ -44,15 +52,26 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
         if not by_type:
             logging.error(f"Unsupported selector type: {selector_type}")
             return False, None
-
+
+        # For class selectors, convert to CSS format
+        selector_value_to_use = selector_value
+        if selector_type.lower() == 'class':
+            # Handle space-separated class names by converting to CSS format
+            classes = selector_value.split()
+            selector_value_to_use = '.' + '.'.join(classes)
+            logging.info(f"Converted class selector '{selector_value}' to CSS selector '{selector_value_to_use}'")
+
         # Wait for element to be present and interactable
-        wait = WebDriverWait(driver, 10)
-        element = wait.until(EC.presence_of_element_located((by_type, selector_value)))
-        wait.until(EC.element_to_be_clickable((by_type, selector_value)))
+        wait = WebDriverWait(driver, 30)
+        element = wait.until(EC.presence_of_element_located((by_type, selector_value_to_use)))
+        wait.until(EC.element_to_be_clickable((by_type, selector_value_to_use)))
 
         # Get element's outer HTML
         element_html = element.get_attribute('outerHTML')
 
+        # Prioritize input_text over value
+        value = interaction.get("input_text", interaction.get("value", ""))
+
         # Execute the interaction
         if action == "click":
             element.click()
@@ -66,22 +85,24 @@ def execute_interaction(driver: webdriver.Chrome, interaction: Dict[str, Any]) -
             logging.error(f"Unsupported action: {action}")
             return False, element_html
 
+        logging.info(f"Successfully executed {action} on {selector_type}={selector_value} with value '{value}'")
         return True, element_html
 
     except Exception as e:
         logging.error(f"Error executing interaction: {str(e)}")
         return False, None
 
-def save_screenshot(driver: webdriver.Chrome, filepath: str) -> bool:
+def save_screenshot(driver: webdriver.Chrome, filepath: Union[str, Path]) -> Optional[str]:
     """Save screenshot of the current page state"""
     try:
-        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
-        driver.save_screenshot(filepath)
+        filepath = Path(filepath)
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        driver.save_screenshot(str(filepath))
         logging.info(f"Screenshot saved to {filepath}")
-        return True
+        return str(filepath)
     except Exception as e:
         logging.error(f"Error saving screenshot: {str(e)}")
-        return False
+        return None
 
 def get_accessibility_tree(driver: webdriver.Chrome) -> Dict[str, Any]:
     """Get accessibility tree of the current page"""
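
Finally, a usage sketch for the reworked `execute_interaction`, showing the new structured `target_element` format, the legacy `"type=value"` selector fallback, and the class-to-CSS conversion. The URL and element selectors below are placeholders, and the snippet needs a live Chrome session to actually run.

```python
# Illustrative only: the two interaction formats execute_interaction now accepts.
# "https://example.com" and "submit-btn" are placeholder values, not real tasks.
from selenium import webdriver
from utils import execute_interaction

driver = webdriver.Chrome()  # Selenium Manager resolves the driver binary
driver.get("https://example.com")

# New structured format (what SerialTaskRunner now builds from the model output)
new_style = {
    "action": "click",
    "target_element": {"type": "id", "value": "submit-btn"},
    "input_text": "",
}

# Legacy "type=value" string format, still honored as a fallback
old_style = {"action": "click", "selector": "id=submit-btn", "value": ""}

success, element_html = execute_interaction(driver, new_style)
print(success, element_html)

# A class selector with several space-separated classes, e.g. "btn btn-primary",
# is converted internally to the CSS selector ".btn.btn-primary" before lookup.
driver.quit()
```
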