Commit 117a792 (1 parent: 8fbea63)
Showing 11 changed files with 705 additions and 445 deletions.
@@ -0,0 +1,109 @@
import json
from collections import defaultdict
from typing import Dict, List, Any


def load_results() -> List[Dict[str, Any]]:
    with open('results/results.json') as f:
        return json.load(f)


def analyze_results(results: List[Dict[str, Any]]) -> None:
    total_tasks = len(results)
    successes = [r for r in results if r.get('success', False)]
    failures = [r for r in results if not r.get('success', False)]

    print("\n=== Overall Statistics ===")
    print(f"Total Tasks: {total_tasks}")
    print(f"Success Rate: {len(successes)/total_tasks*100:.2f}% ({len(successes)} successes, {len(failures)} failures)")

    # Error Analysis
    error_types = defaultdict(int)
    for task in failures:
        error = task.get('error', 'Unknown error')
        if isinstance(error, str):
            # Simplify error messages to group similar errors
            if 'has no attribute' in error:
                error = "Missing attribute error"
            elif 'timeout' in error.lower():
                error = "Timeout error"
            elif 'not found' in error.lower():
                error = "Element not found"
            elif 'failed evaluation' in error.lower():
                error = "Failed evaluation checks"
        error_types[error] += 1

    print("\n=== Error Analysis ===")
    print("Common failure reasons:")
    for error, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(failures)) * 100
        print(f"{error}: {percentage:.1f}% ({count} tasks)")

    # Task Type Analysis
    def categorize_task(task_desc: str) -> str:
        desc = task_desc.lower()
        if 'click' in desc:
            return 'Click'
        elif 'type' in desc or 'enter' in desc:
            return 'Type/Input'
        elif 'search' in desc:
            return 'Search'
        elif 'hover' in desc:
            return 'Hover'
        return 'Other'

    task_types = defaultdict(lambda: {'success': 0, 'fail': 0})
    for task in results:
        task_type = categorize_task(task.get('task_description', ''))
        if task.get('success', False):
            task_types[task_type]['success'] += 1
        else:
            task_types[task_type]['fail'] += 1

    print("\n=== Task Type Analysis ===")
    for task_type, stats in task_types.items():
        total = stats['success'] + stats['fail']
        success_rate = (stats['success']/total*100) if total > 0 else 0
        print(f"{task_type}: {success_rate:.1f}% success rate ({stats['success']}/{total} tasks)")

    # Website Analysis
    def extract_website(task_id: str) -> str:
        return task_id.split('_')[0] if '_' in task_id else 'unknown'

    website_stats = defaultdict(lambda: {'success': 0, 'fail': 0})
    for task in results:
        website = extract_website(task.get('task_id', 'unknown'))
        if task.get('success', False):
            website_stats[website]['success'] += 1
        else:
            website_stats[website]['fail'] += 1

    print("\n=== Website Performance ===")
    for website, stats in sorted(website_stats.items(),
                                 key=lambda x: (x[1]['success'] + x[1]['fail']),
                                 reverse=True):
        total = stats['success'] + stats['fail']
        if total < 2:  # Skip websites with very few tasks
            continue
        success_rate = (stats['success']/total*100)
        print(f"{website}: {success_rate:.1f}% success rate ({stats['success']}/{total} tasks)")

    # Example Analysis
    print("\n=== Example Cases ===")
    print("\nSuccessful Tasks:")
    for task in successes[:3]:
        print(f"✓ {task.get('task_description', '')}")
        print(f" ID: {task.get('task_id', '')}")
        if task.get('error'):
            print(f" Note: {task['error']}")
        print()

    print("\nFailed Tasks:")
    for task in failures[:3]:
        print(f"✗ {task.get('task_description', '')}")
        print(f" ID: {task.get('task_id', '')}")
        if task.get('error'):
            print(f" Error: {task['error']}")
        print()


if __name__ == "__main__":
    results = load_results()
    analyze_results(results)
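
For context, the analyzer above expects results/results.json to be a list of task records carrying at least task_id, task_description, success, and an optional error field (html_element is also read by the cleaner later in this commit). The sketch below writes a minimal, invented results file so the script can be exercised end to end; every value is illustrative, and the "<website>_<index>" task_id convention is only inferred from extract_website().

# Sketch: generate an illustrative results/results.json for a dry run.
# All values below are invented for demonstration; only the field names
# mirror what analyze_results() reads.
import json
from pathlib import Path

sample_results = [
    {
        "task_id": "example.com_0001",  # "<website>_<index>" convention inferred from extract_website()
        "task_description": "Click the login button",
        "success": True,
        "error": None,
    },
    {
        "task_id": "example.com_0002",
        "task_description": "Type a query into the search box",
        "success": False,
        "error": "Element not found: input[name='q']",
        "html_element": "<input name='q' type='text'>",  # optional; consumed by the cleaner's prompt
    },
]

Path("results").mkdir(exist_ok=True)
with open("results/results.json", "w") as f:
    json.dump(sample_results, f, indent=2)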
@@ -0,0 +1,118 @@
import json
import os
from pathlib import Path
from typing import Dict, List, Any, Optional

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()


class DatasetCleaner:
    def __init__(self, results_file: str, api_key: Optional[str] = None):
        """Initialize the dataset cleaner.

        Args:
            results_file: Path to results.json file
            api_key: OpenAI API key (optional, will use environment variable if not provided)
        """
        self.results_file = Path(results_file)
        self.client = OpenAI(api_key=api_key)

    def analyze_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze a single result entry to determine if it's valid."""
        response = self.client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "system",
                    "content": """You are an expert at analyzing web automation test results to determine if a test case is invalid.
A test case should be considered invalid if it encounters issues that make it unsuitable for benchmarking, such as:
1. CAPTCHA or verification challenges
2. Network or connection issues
3. Page timeouts or loading failures
4. Security blocks or authentication requirements
5. Missing or broken page elements
6. Browser crashes
7. Rate limiting or API errors
8. Geolocation restrictions"""
                },
                {
                    "role": "user",
                    "content": f"""Analyze this test result and determine if it should be excluded from benchmarking:
Task ID: {result['task_id']}
Success: {result['success']}
Error: {result.get('error', 'None')}
Task Description: {result['task_description']}
HTML Element: {result.get('html_element', 'None')}
Respond with a JSON object containing:
{{
    "is_valid": boolean,
    "reason": string explaining why the test case is invalid (if applicable),
    "confidence": float between 0 and 1
}}"""
                }
            ],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def clean_dataset(self, min_confidence: float = 0.8) -> Dict[str, List[str]]:
        """Clean the dataset by analyzing results.json entries.

        Args:
            min_confidence: Minimum confidence threshold for filtering (default: 0.8)

        Returns:
            Dictionary containing lists of valid and invalid test cases
        """
        results = {
            "valid": [],
            "invalid": []
        }

        # Load and process results.json
        with open(self.results_file) as f:
            test_results = json.load(f)

        for result in test_results:
            analysis = self.analyze_result(result)

            if analysis["is_valid"] or analysis["confidence"] < min_confidence:
                results["valid"].append(result["task_id"])
            else:
                results["invalid"].append({
                    "task_id": result["task_id"],
                    "reason": analysis["reason"],
                    "confidence": analysis["confidence"]
                })

        # Save results
        output_path = self.results_file.parent / "dataset_cleaning_results.json"
        with open(output_path, "w") as f:
            json.dump(results, f, indent=2)

        print(f"Dataset cleaning results saved to {output_path}")
        print(f"Valid test cases: {len(results['valid'])}")
        print(f"Invalid test cases: {len(results['invalid'])}")
        print("\nInvalid test cases and reasons:")
        for invalid in results["invalid"]:
            print(f"- {invalid['task_id']}: {invalid['reason']} (confidence: {invalid['confidence']:.2f})")

        return results


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Clean benchmark dataset by filtering invalid test cases")
    parser.add_argument("results_file", help="Path to results.json file")
    parser.add_argument("--min-confidence", type=float, default=0.8,
                        help="Minimum confidence threshold for filtering (default: 0.8)")
    parser.add_argument("--api-key", help="OpenAI API key (optional)")

    args = parser.parse_args()

    # Prefer an explicitly passed --api-key, falling back to the environment variable
    cleaner = DatasetCleaner(args.results_file, args.api_key or os.getenv("OPENAI_API_KEY"))
    results = cleaner.clean_dataset(min_confidence=args.min_confidence)
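
For reference, here is a sketch of how the cleaner might be driven from another script instead of the CLI. The module name dataset_cleaner is an assumption (the filename is not visible in this diff), and an OPENAI_API_KEY is expected in the environment or a .env file. Note the design choice in clean_dataset: a task is excluded only when the model both marks it invalid and reports confidence at or above min_confidence, so borderline judgments stay in the dataset.

# Hypothetical programmatic use; the module name "dataset_cleaner" is assumed.
# Requires OPENAI_API_KEY in the environment or a .env file.
from dataset_cleaner import DatasetCleaner

cleaner = DatasetCleaner("results/results.json")
summary = cleaner.clean_dataset(min_confidence=0.9)
print(f"Kept {len(summary['valid'])} tasks, flagged {len(summary['invalid'])} as invalid")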