run.py
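"""Run web automation benchmark tasks.

Loads tasks from a JSONL file, runs them serially or in parallel with the
selected model (gpt4, claude, or gemini), writes results.json to the output
directory, and optionally scores each task with the evaluation pipeline in
evaluation.auto_eval.
"""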
import argparse
import json
import logging
import os
import time
from pathlib import Path

from dotenv import load_dotenv

from evaluation.auto_eval import run_evaluation
from models import GPT4Model, ClaudeModel, GeminiModel
from parallel_runner import run_parallel_benchmark
from serial_runner import run_serial_benchmark
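
# get_model() reads the key for the selected model from the environment; a
# .env file in the working directory is loaded via python-dotenv. The values
# below are illustrative placeholders only, not real keys:
#
#   OPENAI_API_KEY=...
#   ANTHROPIC_API_KEY=...
#   GOOGLE_API_KEY=...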

def get_model(model_name):
    """Get the appropriate model based on command line argument."""
    load_dotenv()

    # Validate that an API key exists for the selected model
    api_keys = {
        'gpt4': ('OPENAI_API_KEY', os.getenv("OPENAI_API_KEY")),
        'claude': ('ANTHROPIC_API_KEY', os.getenv("ANTHROPIC_API_KEY")),
        'gemini': ('GOOGLE_API_KEY', os.getenv("GOOGLE_API_KEY"))
    }
    if model_name not in api_keys:
        raise ValueError(f"Model {model_name} not supported. Choose from: {', '.join(api_keys.keys())}")

    key_name, api_key = api_keys[model_name]
    if not api_key:
        raise ValueError(f"Missing {key_name} in environment variables. Please set it in your .env file.")

    models = {
        'gpt4': lambda: GPT4Model(api_key=api_key),
        'claude': lambda: ClaudeModel(api_key=api_key, model_config={}),
        'gemini': lambda: GeminiModel(api_key=api_key, model_config={})
    }
    return models[model_name]()


def main():
    parser = argparse.ArgumentParser(description='Run web automation tasks')
    parser.add_argument('--tasks', type=str, required=True, help='Path to tasks JSONL file')
    parser.add_argument('--output', type=str, required=True, help='Output directory for results')
    parser.add_argument('--max-workers', type=int, default=4,
                        help='Number of parallel workers (only for parallel mode)')
    parser.add_argument('--mode', type=str, choices=['serial', 'parallel'], default='parallel',
                        help='Run tasks serially or in parallel')
    parser.add_argument('--wait-time', type=float, default=2.0, help='Wait time between actions in seconds')
    parser.add_argument('--evaluate', action='store_true', help='Run evaluation after benchmark')
    parser.add_argument('--evaluate-mode', type=str, choices=['serial', 'parallel'], default='parallel',
                        help='Run evaluations serially or in parallel')
    parser.add_argument('--model', choices=['gpt4', 'claude', 'gemini'], default='gpt4',
                        help='Model to use for the benchmark')
    args = parser.parse_args()
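
    # Example invocation (file and directory names are illustrative):
    #   python run.py --tasks tasks.jsonl --output results --model claude --evaluate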

    # Create output directory if it doesn't exist
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize the selected model
    model = get_model(args.model)

    # Run benchmark based on mode
    if args.mode == 'parallel':
        results = run_parallel_benchmark(
            tasks_file=args.tasks,
            output_dir=str(output_dir),
            max_workers=args.max_workers,
            wait_time=args.wait_time,
            model=model
        )
    else:
        results = run_serial_benchmark(
            tasks_file=args.tasks,
            output_dir=str(output_dir),
            wait_time=args.wait_time,
            model=model
        )
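
    # Both runners are assumed to return a list of per-task result dicts; the
    # evaluation step below reads 'task_id' and 'error' from each entry and
    # updates 'success', 'final_score', and 'llm_evaluations' in place.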

    # Save results
    results_file = output_dir / 'results.json'
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)

    # Run evaluation if requested
    if args.evaluate:
        # Run evaluations
        eval_results = run_evaluation(
            tasks_file=Path(args.tasks),
            results_dir=results_file,
            output_file=None,  # Don't save to separate file
            openai_key=os.getenv('OPENAI_API_KEY'),
            max_workers=args.max_workers if args.evaluate_mode == 'parallel' else None
        )
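
        # Each entry in eval_results['evaluations'] is expected to include a
        # 'task_id' plus the score and reasoning fields read below; missing
        # fields fall back to the defaults passed to .get().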

        # Update results with evaluations
        for result in results:
            task_id = result['task_id']
            eval_result = next((e for e in eval_results['evaluations'] if e['task_id'] == task_id), None)
            if eval_result:
                # Get evaluation scores and explanations, with defaults if missing
                visual_score = eval_result.get('visual_score', 0.0)
                html_score = eval_result.get('html_score', 0.0)
                final_score = eval_result.get('final_score', 0.0)  # Get final score from evaluation
                visual_reasoning = eval_result.get('visual_reasoning', 'No visual evaluation available')
                html_reasoning = eval_result.get('html_reasoning', 'No HTML evaluation available')

                # Add evaluation scores to result
                result['final_score'] = final_score  # Add final score at top level
                result['llm_evaluations'] = {
                    'image_similarity': {
                        'score': visual_score,
                        'explanation': visual_reasoning
                    },
                    'html_fuzzy_match': {
                        'score': html_score,
                        'explanation': html_reasoning
                    }
                }

                # Update success based on evaluation scores
                # Only mark as success if both image and HTML evaluations pass
                result['success'] = (visual_score > 0.5 and html_score > 0.5)
                if not result['success'] and not result['error']:
                    result['error'] = "Failed evaluation checks"

        # Save updated results
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)


if __name__ == '__main__':
    main()