diff --git a/.gitignore b/.gitignore index 9dbfefc0..0f9ba034 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ venv/ # docs documents.txt credentials.json +creds.json # virtualenv .venv @@ -29,3 +30,4 @@ credentials.json **/.deepeval-cache.json temp_test_run_data.json **/llm_tests_output.txt +**/error_log.txt diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py index aac81503..5f708fb3 100644 --- a/evaluation/auto_evaluation/eval_main.py +++ b/evaluation/auto_evaluation/eval_main.py @@ -19,7 +19,7 @@ make_hallucination_metric, ) from auto_evaluation.dataset import hf_pull, preprocess -from tqdm import tqdm # type: ignore +from tqdm import tqdm eval_root_path = os.path.join(os.path.dirname(__file__), "..") load_dotenv(dotenv_path=os.path.join(eval_root_path, ".env")) diff --git a/evaluation/human_evaluation/main.py b/evaluation/human_evaluation/main.py index 5d04e676..e3ea3987 100644 --- a/evaluation/human_evaluation/main.py +++ b/evaluation/human_evaluation/main.py @@ -2,9 +2,13 @@ from dotenv import load_dotenv import os -from utils.sheets import read_questions_and_answers, write_responses, find_new_questions -from utils.api import fetch_endpoints, get_responses -from utils.utils import ( +from human_evaluation.utils.sheets import ( + read_questions_and_answers, + write_responses, + find_new_questions, +) +from human_evaluation.utils.api import fetch_endpoints, get_responses +from human_evaluation.utils.utils import ( parse_custom_input, selected_questions, update_gform, diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml index 013e1111..107c4a7c 100644 --- a/evaluation/pyproject.toml +++ b/evaluation/pyproject.toml @@ -21,7 +21,7 @@ dependencies = { file = ["requirements.txt"] } optional-dependencies = { test = { file = ["requirements-test.txt"] } } [tool.setuptools.packages.find] -include = ["auto_evaluation", "human_evaluation"] +include = ["auto_evaluation", "human_evaluation", "script_based_evaluation"] [tool.mypy] python_version = "3.12" @@ -30,6 +30,7 @@ warn_return_any = true warn_unused_ignores = true strict_optional = true disable_error_code = ["call-arg"] +explicit_package_bases = true exclude = "src/post_install.py" [[tool.mypy.overrides]] diff --git a/evaluation/requirements-test.txt b/evaluation/requirements-test.txt index de806103..24cd375b 100644 --- a/evaluation/requirements-test.txt +++ b/evaluation/requirements-test.txt @@ -2,3 +2,4 @@ mypy==1.10.1 ruff==0.5.1 types-requests==2.32.0.20240622 google-api-python-client-stubs==1.28.0 +types-tqdm==4.67.0.20241221 diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt index 96e6f1ae..8d73683e 100644 --- a/evaluation/requirements.txt +++ b/evaluation/requirements.txt @@ -13,3 +13,8 @@ langchain-google-vertexai==2.0.6 asyncio==3.4.3 huggingface-hub==0.26.2 instructor[vertexai]==1.5.2 +openai==1.58.1 +pydantic==2.10.4 +tqdm==4.67.1 +vertexai==1.71.1 +plotly==5.24.1 diff --git a/evaluation/script_based_evaluation/.env.sample b/evaluation/script_based_evaluation/.env.sample new file mode 100644 index 00000000..7182ea5c --- /dev/null +++ b/evaluation/script_based_evaluation/.env.sample @@ -0,0 +1,2 @@ +GOOGLE_APPLICATION_CREDENTIALS={{GOOGLE_APPLICATION_CREDENTIALS}} +OPENAI_API_KEY={{OPENAI_API_KEY}} \ No newline at end of file diff --git a/evaluation/script_based_evaluation/README.md b/evaluation/script_based_evaluation/README.md new file mode 100644 index 00000000..8abd4e02 --- /dev/null +++ b/evaluation/script_based_evaluation/README.md @@ -0,0 
+1,134 @@
+# ORAssistant Automated Evaluation
+
+This project automates the evaluation of language model responses using classification-based metrics and LLMScore. It supports testing against various models, including OpenAI and Google Vertex AI models. It also serves as an evaluation benchmark for comparing multiple versions of ORAssistant.
+
+## Features
+
+1. **Classification-based Metrics**:
+   - Categorizes responses into True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN).
+   - Computes metrics such as Accuracy, Precision, Recall, and F1 Score.
+
+2. **LLMScore**:
+   - Assigns a score between 0 and 1 by comparing the generated response's quality and accuracy against the ground truth.
+
+## Setup
+
+### Environment Variables
+
+Create a `.env` file in the root directory with the following variables:
+```plaintext
+GOOGLE_APPLICATION_CREDENTIALS=path/to/secret.json
+OPENAI_API_KEY=your_openai_api_key  # Required if testing against OpenAI models
+```
+
+### Required Files
+
+- `secret.json`: Ensure you have a Google Vertex AI subscription and the necessary credentials file.
+
+### Data Files
+
+- **Input File**: `data/data.csv`
+  - This file should contain the questions to be tested. Ensure it is formatted as a CSV file with the following columns: `Question`, `Answer`.
+
+- **Output File**: `data/data_result.csv`
+  - This file will be generated after running the script. It contains the results of the evaluation.
+
+## How to Run
+
+1. **Activate the virtual environment**
+
+   From the parent directory (`evaluation`), make sure you have run `make init`
+   before activating the virtual environment; this is needed so that this folder
+   is recognised as a submodule.
+
+2. **Run the Script**
+
+   Use the following command to execute the script with customizable options:
+
+   ```bash
+   python main.py --env-path /path/to/.env --creds-path /path/to/secret.json --iterations 10 --llms "base-gemini-1.5-flash,base-gpt-4o" --agent-retrievers "v1=http://url1.com,v2=http://url2.com"
+   ```
+
+   - `--env-path`: Path to the `.env` file.
+   - `--creds-path`: Path to the `secret.json` file.
+   - `--iterations`: Number of iterations per question.
+   - `--llms`: Comma-separated list of LLMs to test.
+   - `--agent-retrievers`: Comma-separated list of agent-retriever names and URLs.
+
+3. **View Results**
+
+   Results will be saved in a CSV file named after the input data file with `_result` appended.
+
+## Basic Usage
+
+### a. Default Usage
+
+```bash
+python main.py
+```
+
+- Uses the default `.env` file in the project root.
+- Uses `data/data.csv` as the default input.
+- 5 iterations per question.
+- Tests all available LLMs.
+- No additional agent-retrievers.
+
+### b. Specify .env and secret.json Paths
+
+```bash
+python main.py --env-path /path/to/.env --creds-path /path/to/secret.json
+```
+
+### c. Customize Iterations and Select Specific LLMs
+
+```bash
+python main.py --iterations 10 --llms "base-gpt-4o,base-gemini-1.5-flash"
+```
+
+### d. Add Agent-Retrievers with Custom Names
+
+```bash
+python main.py --agent-retrievers "v1=http://url1.com,v2=http://url2.com"
+```
+
+### e. Full Example with All Options
+
+```bash
+python main.py \
+  --env-path /path/to/.env \
+  --creds-path /path/to/secret.json \
+  --iterations 10 \
+  --llms "base-gemini-1.5-flash,base-gpt-4o" \
+  --agent-retrievers "v1=http://url1.com,v2=http://url2.com"
+```
+
+### f.
Display Help Message + +To view all available command-line options: + +```bash +python main.py --help +``` + +### Run Analysis + +After generating results, you can perform analysis using the provided `analysis.py` script. To run the analysis, execute the following command: + +```bash +streamlit run analysis.py +``` + + +### Sample Comparison Commands + +1. To compare three versions of ORAssistant, use: + ```bash + python main.py --agent-retrievers "orassistant-v1=http://url1.com,orassistant-v2=http://url2.com,orassistant-v3=http://url3.com" + ``` + *Note: Each URL is the endpoint of the ORAssistant backend.* + +2. To compare ORAssistant with base-gpt-4o, use: + ```bash + python main.py --llms "base-gpt-4o" --agent-retrievers "orassistant=http://url.com" + ``` + *Note: The URL is the endpoint of the ORAssistant backend.* \ No newline at end of file diff --git a/evaluation/script_based_evaluation/__init__.py b/evaluation/script_based_evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluation/script_based_evaluation/analysis.py b/evaluation/script_based_evaluation/analysis.py new file mode 100644 index 00000000..4aa8627b --- /dev/null +++ b/evaluation/script_based_evaluation/analysis.py @@ -0,0 +1,326 @@ +import streamlit as st +import pandas as pd # type: ignore +import plotly.express as px # type: ignore +import plotly.graph_objects as go # type: ignore + +st.set_page_config(layout="wide") + + +def load_data(file_path): + df = pd.read_csv(file_path) + return df + + +def display_metric_formulas(): + st.subheader("Metric Formulas") + + st.latex(r""" + \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} + """) + + st.latex(r""" + \text{Precision} = \frac{TP}{TP + FP} + """) + + st.latex(r""" + \text{Recall} = \frac{TP}{TP + FN} + """) + + st.latex(r""" + \text{F1 Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}} + """) + + st.markdown(""" + Where: + - TP = True Positives + - TN = True Negatives + - FP = False Positives + - FN = False Negatives + """) + + st.subheader("Explanation of TP, TN, FP, FN") + + st.markdown(""" + **True Positive (TP):** + - Description: The model provided a correct and relevant answer. + - Example: + - Question: "What does CTS stand for in the OpenROAD flow?" + - Model Answer: "CTS stands for Clock Tree Synthesis. It is a stage in the OpenROAD flow that synthesizes the clock distribution network. CTS inserts clock buffers to distribute the clock signal to all sequential elements while minimizing skew. The CTS metrics reported include the number of clock roots, number of buffers inserted, number of clock subnets, and number of sinks." + - Evaluation: TP (The model provided a detailed, accurate, and relevant answer.) + + **True Negative (TN):** + - Description: The model correctly identified that it couldn't answer a question or that the question was out of scope. + - Example: + - Question: "What is the latest movie released in theaters?" + - Model Answer: "I can't provide information on movies as it is out of my scope." + - Evaluation: TN (The model correctly identified that the question was out of scope.) + + **False Positive (FP):** + - Description: The model provided an answer that it thought was correct, but was actually incorrect or irrelevant. + - Example: + - Question: "What does CTS stand for in the OpenROAD flow?" + - Model Answer: "CTS stands for Central Time Scheduling. It is a process used in schools to manage class schedules." 
+ - Evaluation: FP (The model provided an incorrect and irrelevant answer.) + + **False Negative (FN):** + - Description: The model failed to provide an answer when it should have been able to. + - Example: + - Question: "What does CTS stand for in the OpenROAD flow?" + - Model Answer: "I cannot provide an answer to this question." + - Evaluation: FN (The model failed to provide an answer when it was expected to.) + """) + + +def calculate_accuracy_counts(df): + if "architecture" in df.columns and "acc_value" in df.columns: + if df["acc_value"].dtype == "object": + df["acc_value"] = df["acc_value"].astype(str).str.strip().str.upper() + else: + pass + + accuracy_counts = ( + df.groupby(["architecture", "acc_value"]).size().unstack(fill_value=0) + ) + + for col in ["TP", "TN", "FP", "FN"]: + if col not in accuracy_counts.columns: + accuracy_counts[col] = 0 + + accuracy_counts["Total"] = accuracy_counts.sum(axis=1) + + return accuracy_counts + else: + st.error( + "Required columns 'architecture' or 'acc_value' not found in the DataFrame." + ) + return None + + +def calculate_metrics(df): + # Calculate metrics for each architecture + metrics = {} + for arch in df["architecture"].unique(): + arch_data = df[df["architecture"] == arch] + + tp = sum(arch_data["acc_value"] == "TP") + tn = sum(arch_data["acc_value"] == "TN") + fp = sum(arch_data["acc_value"] == "FP") + fn = sum(arch_data["acc_value"] == "FN") + + total = tp + tn + fp + fn + accuracy = (tp + tn) / total if total > 0 else 0 + precision = tp / (tp + fp) if (tp + fp) > 0 else 0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 0 + f1_score = ( + 2 * (precision * recall) / (precision + recall) + if (precision + recall) > 0 + else 0 + ) + + llm_score = arch_data["llm_score"].mean() + response_time = arch_data["response_time"].mean() + + metrics[arch] = { + "Accuracy": accuracy, + "Precision": precision, + "Recall": recall, + "F1 Score": f1_score, + "LLM Score": llm_score, + "Response Time": response_time, + "TP": tp, + "TN": tn, + "FP": fp, + "FN": fn, + "Total": total, + } + + return metrics + + +def main(): + st.title("Architecture Metrics Visualization") + + # Get list of CSV files in the current directory + selected_file = "data/data_result.csv" + + # Load the selected CSV file + df = load_data(selected_file) + + # Display the selected CSV file in table format + st.subheader(f"Contents of {selected_file}") + st.dataframe(df) + + # Display metric formulas and explanations + display_metric_formulas() + + # Calculate metrics + metrics = calculate_metrics(df) + + # Display accuracy counts + st.subheader("Accuracy Value Counts per Architecture") + accuracy_counts = pd.DataFrame( + { + arch: { + "TP": m["TP"], + "TN": m["TN"], + "FP": m["FP"], + "FN": m["FN"], + "Total": m["Total"], + } + for arch, m in metrics.items() + } + ).T + st.dataframe(accuracy_counts) + + st.subheader("Distribution of Accuracy Values Across Architectures") + accuracy_counts_melted = accuracy_counts.reset_index().melt( + id_vars="index", value_vars=["TP", "TN", "FP", "FN"] + ) + accuracy_counts_melted.columns = ["Architecture", "Accuracy Value", "Count"] + fig_acc = px.bar( + accuracy_counts_melted, + x="Architecture", + y="Count", + color="Accuracy Value", + barmode="group", + title="Distribution of Accuracy Values Across Architectures", + labels={ + "Architecture": "Architecture", + "Count": "Count", + "Accuracy Value": "Accuracy Value", + }, + ) + st.plotly_chart(fig_acc) + + custom_colors = { + "v1": "#1f77b4", # Blue + "v2": "#ff7f0e", # Orange + 
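+        # NOTE: keys here are expected to match the 'architecture' values in data_result.csv; architectures without an entry fall back to Plotly's default palette in the bar chart and to black in the precision-recall scatter.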
"base-gemini-1.5-flash": "#2ca02c", # Green + "base-gpt-4o": "#d62728", # Red + } + + metric_names = [ + "Accuracy", + "Precision", + "Recall", + "F1 Score", + "LLM Score", + "Response Time", + ] + selected_metric = st.selectbox( + "Select a metric to visualize", metric_names, key="metric_selector" + ) + + st.subheader(f"{selected_metric} Comparison Across Architectures") + sorted_architectures = sorted( + metrics.keys(), + key=lambda x: metrics[x][selected_metric], + reverse=True if selected_metric != "Response Time" else False, + ) + fig = px.bar( + x=sorted_architectures, + y=[metrics[arch][selected_metric] for arch in sorted_architectures], + labels={"x": "Architecture", "y": selected_metric}, + title=f"{selected_metric} Comparison Across Architectures", + color=sorted_architectures, + color_discrete_map=custom_colors, + ) + fig.update_layout( + xaxis_tickangle=-45, + yaxis_range=[0.7, 1] + if selected_metric != "Response Time" + else [0, max([metrics[arch][selected_metric] for arch in metrics]) * 1.1], + yaxis_tickformat=".5f" + if selected_metric == "LLM Score" + else ".1%" + if selected_metric != "Response Time" + else "", + height=600, + width=800, + legend_title="Architectures", + showlegend=False, + ) + fig.update_traces( + texttemplate="%{y:.5f}" + if selected_metric == "LLM Score" + else "%{y:.1%}" + if selected_metric != "Response Time" + else "%{y:.2f}", + textposition="outside", + ) + st.plotly_chart(fig) + + # Create precision-recall graph + st.subheader("Precision vs Recall for All Architectures") + fig_pr = go.Figure() + + for arch, metric in metrics.items(): + fig_pr.add_trace( + go.Scatter( + x=[metric["Recall"]], + y=[metric["Precision"]], + mode="markers+text", + name=arch, + marker=dict( + size=15, color=custom_colors.get(arch, "#000000") + ), # Default to black if color not found + text=[arch], + textposition="top center", + ) + ) + + fig_pr.update_layout( + title="Precision vs Recall for All Architectures", + xaxis_title="Recall", + yaxis_title="Precision", + xaxis=dict(range=[0.9, 1], tickformat=".1%"), + yaxis=dict(range=[0.9, 1], tickformat=".1%"), + height=600, + width=800, + showlegend=False, + ) + st.plotly_chart(fig_pr) + + # Create heatmap + st.subheader("Heatmap of Metrics Across Architectures") + heatmap_metrics = ["Accuracy", "Precision", "Recall", "F1 Score"] + heatmap_data = [] + for arch in metrics: + for metric in heatmap_metrics: + heatmap_data.append([arch, metric, metrics[arch][metric]]) + + heatmap_df = pd.DataFrame(heatmap_data, columns=["Architecture", "Metric", "Value"]) + heatmap_pivot = heatmap_df.pivot( + index="Architecture", columns="Metric", values="Value" + ) + + # Ensure the order of columns matches the order in heatmap_metrics + heatmap_pivot = heatmap_pivot[heatmap_metrics] + + overall_performance = heatmap_pivot.mean(axis=1).sort_values(ascending=False) + sorted_architectures = overall_performance.index.tolist() + + heatmap_pivot = heatmap_pivot.loc[sorted_architectures] + + fig_heatmap = px.imshow( + heatmap_pivot, + labels=dict(x="Metric", y="Architecture", color="Value"), + x=heatmap_metrics, + y=heatmap_pivot.index, + color_continuous_scale="RdYlGn", + title="Heatmap of Metrics Across Architectures", + ) + + fig_heatmap.update_layout( + height=600, + width=800, + ) + + fig_heatmap.update_traces(text=heatmap_pivot.values, texttemplate="%{text:.3f}") + fig_heatmap.update_coloraxes(colorbar_tickformat=".3f") + + st.plotly_chart(fig_heatmap) + + +if __name__ == "__main__": + main() diff --git 
a/evaluation/script_based_evaluation/config/config.py b/evaluation/script_based_evaluation/config/config.py new file mode 100644 index 00000000..2b4c607d --- /dev/null +++ b/evaluation/script_based_evaluation/config/config.py @@ -0,0 +1,23 @@ +import os +from dotenv import load_dotenv, dotenv_values +from typing import Optional + + +def load_environment(env_path: str): + if not os.path.exists(env_path): + raise FileNotFoundError(f"The specified .env file does not exist at {env_path}") + load_dotenv(env_path, override=True) + config = dotenv_values(env_path) + google_creds: Optional[str] = config.get("GOOGLE_APPLICATION_CREDENTIALS") + if google_creds is not None: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_creds + else: + raise KeyError( + "GOOGLE_APPLICATION_CREDENTIALS not found in .env file or is None" + ) + + +def get_config() -> dict[str, str]: + config_raw = dotenv_values(".env") + config = {k: v for k, v in config_raw.items() if v is not None} + return config diff --git a/evaluation/script_based_evaluation/main.py b/evaluation/script_based_evaluation/main.py new file mode 100644 index 00000000..7cef96be --- /dev/null +++ b/evaluation/script_based_evaluation/main.py @@ -0,0 +1,298 @@ +import argparse +import sys +import os +import csv +import traceback +import time +from script_based_evaluation.utils.data_utils import validate_csv_lines +from script_based_evaluation.utils.api_utils import send_request, llm_judge +from script_based_evaluation.utils.logging_utils import log_error +from script_based_evaluation.config.config import load_environment +from openai import OpenAI +from tqdm import tqdm +from typing import Any +import vertexai # type: ignore + + +def get_accuracy_value(response_text: str, ground_truth: str, query_text: str) -> str: + sys_prompt = """ + You are a LLM Judge Evaluator for OpenROAD Chat Bot. All Questions and Answers should be technical and related to OpenROAD, Chip Design, Problems related to it, general query, commands. + Evaluate the response based on the ground truth and return one of the following: TP, TN, FP, FN. + Definitions: + True Positive (TP): The model provided a correct and relevant answer. If the response is partially correct, it should still be considered TP. + True Negative (TN): The model correctly identified that it couldn't answer a question or that the question was out of scope. + False Positive (FP): The model provided an answer that it thought was correct, but was actually incorrect or irrelevant. (NOTE: Mark TP even if it's partially correct, FP is only for completely incorrect or irrelevant answer) + False Negative (FN): The model failed to provide an answer when it should have been able to. + Instructions: Compare the model's response with the ground truth. If the model's response is accurate and relevant, return 'TP'. + If the model correctly identifies that it cannot answer or the question is out of scope, return 'TN'. + If the model's response is incorrect or irrelevant, return 'FP'. + If the model fails to provide an answer when it should have been able to, return 'FN'. + Provide only one of the following outputs: TP, FP, FN, TN. 
+ return type [one word only] for ex: TP + """ + final_query = f"{sys_prompt}\nQUESTION= {query_text}, ANS= {response_text}, GT= {ground_truth}" + for attempt in range(5): + try: + result = llm_judge(final_query) + print("Accuracy evaluation result:", result) + if result.strip() in ["TP", "TN", "FP", "FN"]: + return result.strip() + else: + raise ValueError(f"Invalid accuracy evaluation result: {result}") + except Exception as e: + print(f"Attempt {attempt + 1} failed: {str(e)}") + if attempt == 4: + print(f"All attempts failed. Raw accuracy value: {result}") + raise ValueError( + f"Failed to process accuracy value after 5 attempts: {str(e)}" + ) + time.sleep(2) + + return "Something went wrong while getting accuracy_value" + + +def get_llm_score(response_text: str, ground_truth: str, query_text: str) -> Any: + sys_prompt = """RULE: return type [float] Evaluate the response's accuracy by comparing it to the ground truth answer. Assign a numerical value between 0.00 and 1.00, where 0.00 represents completely inaccurate and 1.00 represents exactly accurate. If the response fails to provide a correct answer, contains apologies such as 'sorry,' or states 'it's not in my context,' assign a score of 0.00 + Your response must be a single float number between 0.00 and 1.00, with two decimal places. Do not include any other text or explanation. + +Example outputs: +0.00 +0.75 +1.00 + """ + final_query = f"{sys_prompt}\nQUESTION= {query_text}, ANS= {response_text}, GT= {ground_truth}" + for attempt in range(5): + try: + result = llm_judge(final_query) + print("LLM score result:", result) + score = float(result.strip()) + if 0.00 <= score <= 1.00: + return score + else: + raise ValueError(f"Invalid LLM score result: {result}") + except ValueError as e: + print(f"Attempt {attempt + 1} failed: {str(e)}") + if attempt == 4: + print(f"All attempts failed. Raw LLM score value: {result}") + raise ValueError( + f"Failed to process LLM score after 5 attempts: {str(e)}" + ) + time.sleep(2) + return "Something went wrong while getting llm_score" + + +def check_hallucination(response_text: str, ground_truth: str, query_text: str) -> Any: + prompt = f""" + Task: Check for hallucination + Question: {query_text} + Response: {response_text} + Ground Truth: {ground_truth} + Task: Determine if the response is unrelated or significantly deviates from the ground truth. + Return "True" if the response is completely different or unrelated to the ground truth. Otherwise, return "False". + """ + for attempt in range(5): + try: + result = llm_judge(prompt) + print("Hallucination check result:", result) + if result.strip() in ["True", "False"]: + return result.strip() == "True" + else: + raise ValueError(f"Invalid hallucination result: {result}") + except Exception as e: + print(f"Attempt {attempt + 1} failed: {str(e)}") + if attempt == 4: + print(f"All attempts failed. 
Raw hallucination value: {result}") + raise ValueError( + f"Failed to process hallucination value after 5 attempts: {str(e)}" + ) + time.sleep(2) + return "Something went wrong while getting check_hallucination" + + +def append_result_to_csv(result: dict[str, object], output_file: str): + fieldnames = [ + "question", + "ground_truth", + "architecture", + "response_time", + "response", + "itr", + "acc_value", + "llm_score", + "hall_score", + ] + try: + file_exists = os.path.isfile(output_file) + with open(output_file, "a", newline="", encoding="utf-8") as csvfile: + writer = csv.DictWriter( + csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL + ) + if not file_exists: + writer.writeheader() + writer.writerow(result) + except Exception as e: + error_message = f"Error in append_result_to_csv: {str(e)}" + log_error(error_message, traceback.format_exc()) + print(f"{error_message}\nCheck logs/error_log.txt for details.") + sys.exit(1) + + +def run_tests( + questions: list[dict[str, str]], + selected_retrievers: list[str], + iterations: int, + output_file: str, + agent_retriever_urls: dict[str, str], + client: OpenAI, +): + total_iterations = len(questions) * len(selected_retrievers) * iterations + print("Total Iterations:", total_iterations) + # Create a progress bar + pbar = tqdm(total=total_iterations) + + for question in questions: + prompt = question["question"] + ground_truth = question["ground_truth"] + + for architecture in selected_retrievers: + for itr in range(1, iterations + 1): + print( + f"Testing question: {prompt} with architecture: {architecture}, iteration: {itr}" + ) + try: + response, response_time = send_request( + architecture, prompt, agent_retriever_urls, client + ) + if not response: + response = "No response received." + + acc_value = get_accuracy_value(response, ground_truth, prompt) + llm_score = get_llm_score(response, ground_truth, prompt) + hallucination = check_hallucination(response, ground_truth, prompt) + + result = { + "question": prompt, + "ground_truth": ground_truth, + "architecture": architecture, + "response_time": response_time, + "response": response, + "itr": itr, + "acc_value": acc_value, + "llm_score": llm_score, + "hall_score": hallucination, + } + + append_result_to_csv(result, output_file) + + except Exception as e: + error_message = f"Error in run_tests: {str(e)}" + log_error(error_message, traceback.format_exc()) + print(f"{error_message}\nCheck logs/error_log.txt for details.") + + pbar.update(1) + + pbar.close() + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Test LLM responses and evaluate them." + ) + parser.add_argument( + "--iterations", + type=int, + default=5, + help="Number of iterations per question per retriever", + ) + parser.add_argument( + "--llms", type=str, default="", help="Comma-separated list of LLMs to test" + ) + parser.add_argument( + "--agent-retrievers", + type=str, + default="", + help="Comma-separated list of agent-retriever names and URLs in the format name=url", + ) + args = parser.parse_args() + + iterations = args.iterations + + selected_retrievers = [] + if args.llms: + selected_retrievers = [llm.strip() for llm in args.llms.split(",")] + + agent_retriever_urls = {} + if args.agent_retrievers: + for item in args.agent_retrievers.split(","): + if "=" in item: + name, url = item.split("=", 1) + name = name.strip() + url = url.strip() + if not name or not url: + print( + f"Invalid format for agent-retriever: '{item}'. Expected format 'name=url'." 
+ ) + sys.exit(1) + agent_retriever_urls[name] = url + else: + print( + f"Invalid format for agent-retriever: '{item}'. Expected format 'name=url'." + ) + sys.exit(1) + + # Merge selected_retrievers with agent_retriever names + if agent_retriever_urls: + selected_retrievers.extend(agent_retriever_urls.keys()) + + # Initialize environment + load_environment(".env") + + # Initialize OpenAI client + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + print("Error: OPENAI_API_KEY environment variable is not set.") + sys.exit(1) + + client = OpenAI(api_key=api_key) + + # Initialize Vertex AI + vertexai.init() + + # Input and Output Files + input_file = "data/data.csv" + output_file = f"{os.path.splitext(input_file)[0]}_result.csv" + + # Validate and read questions + valid_questions = validate_csv_lines(input_file) + + if not valid_questions: + print("Error: No valid questions found in the CSV file.") + sys.exit(1) + + print("\nQuestions to be tested:") + for i, question in enumerate(valid_questions, start=1): + print(f"{i}. {question['question']}") + + input("\nPress Enter to start testing...") + + # Run the tests + try: + run_tests( + valid_questions, + selected_retrievers, + iterations, + output_file, + agent_retriever_urls, + client, + ) + print(f"\nTesting completed. Results saved to {output_file}") + except Exception as e: + error_message = f"An error occurred: {str(e)}" + log_error(error_message, traceback.format_exc()) + print(f"{error_message}\nCheck logs/error_log.txt for details.") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/evaluation/script_based_evaluation/models/gemini_model.py b/evaluation/script_based_evaluation/models/gemini_model.py new file mode 100644 index 00000000..8ac219bf --- /dev/null +++ b/evaluation/script_based_evaluation/models/gemini_model.py @@ -0,0 +1,101 @@ +import time +import sys +import traceback +import vertexai.preview.generative_models as genai # type: ignore +from vertexai.generative_models import ( # type: ignore + HarmCategory, + HarmBlockThreshold, + SafetySetting, +) +from script_based_evaluation.utils.logging_utils import log_error + + +def base_gemini_1_5_flash(query: str) -> tuple[str, float]: + safety_config = [ + SafetySetting( + category=HarmCategory.HARM_CATEGORY_HATE_SPEECH, + threshold=HarmBlockThreshold.BLOCK_NONE, + ), + SafetySetting( + category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + threshold=HarmBlockThreshold.BLOCK_NONE, + ), + SafetySetting( + category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + threshold=HarmBlockThreshold.BLOCK_NONE, + ), + SafetySetting( + category=HarmCategory.HARM_CATEGORY_HARASSMENT, + threshold=HarmBlockThreshold.BLOCK_NONE, + ), + ] + while True: + try: + model = genai.GenerativeModel("gemini-1.5-flash") + start_time = time.time() + query = " " + query + response = model.generate_content(query, safety_settings=safety_config) + end_time = time.time() + response_time = (end_time - start_time) * 1000 # Convert to milliseconds + return response.text, response_time + except Exception as e: + if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): + print("Rate limit exceeded, sleeping for 10 seconds") + time.sleep(10) + else: + error_message = f"Error in base_gemini_1_5_flash: {str(e)}" + error_details = traceback.format_exc() + log_error(error_message, error_details) + print( + "An error occurred while sending request to Gemini. Check error_log.txt for details." 
+                )
+                sys.exit(1)
+
+
+def base_gemini_1_5_pro(query: str) -> tuple[str, float]:
+    safety_config = [
+        SafetySetting(
+            category=HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+            threshold=HarmBlockThreshold.BLOCK_NONE,
+        ),
+        SafetySetting(
+            category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+            threshold=HarmBlockThreshold.BLOCK_NONE,
+        ),
+        SafetySetting(
+            category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+            threshold=HarmBlockThreshold.BLOCK_NONE,
+        ),
+        SafetySetting(
+            category=HarmCategory.HARM_CATEGORY_HARASSMENT,
+            threshold=HarmBlockThreshold.BLOCK_NONE,
+        ),
+    ]
+    while True:
+        try:
+            model = genai.GenerativeModel("gemini-1.5-pro")
+            start_time = time.time()
+            query = " " + query
+            response = model.generate_content(
+                query,
+                safety_settings=safety_config,
+                generation_config=genai.GenerationConfig(
+                    max_output_tokens=2000,
+                    temperature=0.0,
+                ),
+            )
+            end_time = time.time()
+            response_time = (end_time - start_time) * 1000  # Convert to milliseconds
+            return response.text, response_time
+        except Exception as e:
+            if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e):
+                print("Rate limit exceeded, sleeping for 10 seconds")
+                time.sleep(10)
+            else:
+                error_message = f"Error in base_gemini_1_5_pro: {str(e)}"
+                error_details = traceback.format_exc()
+                log_error(error_message, error_details)
+                print(
+                    "An error occurred while sending request to Gemini. Check error_log.txt for details."
+                )
+                sys.exit(1)
diff --git a/evaluation/script_based_evaluation/models/gpt_model.py b/evaluation/script_based_evaluation/models/gpt_model.py
new file mode 100644
index 00000000..f91d9a09
--- /dev/null
+++ b/evaluation/script_based_evaluation/models/gpt_model.py
@@ -0,0 +1,29 @@
+import time
+import sys
+import traceback
+from openai import OpenAI
+from script_based_evaluation.utils.logging_utils import log_error
+
+
+def base_gpt_4o(query: str, client: OpenAI) -> tuple[str, float]:
+    try:
+        start_time = time.time()
+        completion = client.chat.completions.create(
+            model="gpt-4o", messages=[{"role": "user", "content": query}]
+        )
+        response_content = completion.choices[0].message.content
+        if response_content is not None:
+            response = response_content.strip()
+        else:
+            response = ""
+        end_time = time.time()
+        response_time = (end_time - start_time) * 1000  # Convert to milliseconds
+        return response, response_time
+    except Exception as e:
+        error_message = f"Error in base_gpt_4o: {str(e)}"
+        error_details = traceback.format_exc()
+        log_error(error_message, error_details)
+        print(
+            "An error occurred while sending request to GPT-4o. Check error_log.txt for details."
+ ) + sys.exit(1) diff --git a/evaluation/script_based_evaluation/utils/api_utils.py b/evaluation/script_based_evaluation/utils/api_utils.py new file mode 100644 index 00000000..bb54836f --- /dev/null +++ b/evaluation/script_based_evaluation/utils/api_utils.py @@ -0,0 +1,82 @@ +import requests +import time +import sys +import traceback +from script_based_evaluation.utils.logging_utils import log_error +from script_based_evaluation.models.gpt_model import base_gpt_4o +from script_based_evaluation.models.gemini_model import ( + base_gemini_1_5_flash, + base_gemini_1_5_pro, +) +from openai import OpenAI + + +def send_request( + endpoint: str, query: str, agent_retriever_urls: dict[str, str], client: OpenAI +) -> tuple[str | None, float]: + try: + print("Sending request to endpoint:", endpoint) + if endpoint in agent_retriever_urls: + url = f"{agent_retriever_urls[endpoint]}/graphs/agent-retriever" + elif endpoint == "base-gemini-1.5-flash": + response_text, response_time = base_gemini_1_5_flash(query) + print("Response:", response_text) + return response_text, response_time + elif endpoint == "base-gpt-4o": + response_text, response_time = base_gpt_4o(query, client) + print("Response:", response_text) + return response_text, response_time + + payload = {"query": query, "list_context": True, "list_sources": True} + print(f"POST {url} with payload: {payload}") + response = requests.post(url, json=payload) + + try: + response_json = response.json() + print("Response:", response_json.get("response")) + return response_json.get( + "response" + ), response.elapsed.total_seconds() * 1000 + except ValueError as e: + print(f"Error parsing JSON response: {str(e)}") + return None, response.elapsed.total_seconds() * 1000 + except Exception as e: + error_message = f"Error in send_request: {str(e)}" + log_error(error_message, traceback.format_exc()) + raise + + +def llm_judge(prompt: str) -> str: + while True: + try: + response_text, _ = base_gemini_1_5_pro(prompt) + return response_text + except Exception as e: + if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): + print("Rate limit exceeded, sleeping for 10 seconds") + time.sleep(10) + else: + log_error(f"Error in llm_judge: {str(e)}", traceback.format_exc()) + print( + "An error occurred while sending request to Gemini. Check error_log.txt for details." + ) + sys.exit(1) + + +def send_request_gemini(prompt: str) -> str: + while True: + try: + response_text, _ = base_gemini_1_5_flash(prompt) + return response_text + except Exception as e: + if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e): + print("Rate limit exceeded, sleeping for 10 seconds") + time.sleep(10) + else: + log_error( + f"Error in send_request_gemini: {str(e)}", traceback.format_exc() + ) + print( + "An error occurred while sending request to Gemini. Check error_log.txt for details." 
+ ) + sys.exit(1) diff --git a/evaluation/script_based_evaluation/utils/data_utils.py b/evaluation/script_based_evaluation/utils/data_utils.py new file mode 100644 index 00000000..86738baf --- /dev/null +++ b/evaluation/script_based_evaluation/utils/data_utils.py @@ -0,0 +1,37 @@ +import csv +from script_based_evaluation.utils.logging_utils import log_error +import traceback + + +def read_data(csv_file: str) -> list[dict[str, str]]: + questions = [] + with open(csv_file, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f) + header = next(reader) # Skip the header row + assert len(header) == 2, "CSV file must have exactly 2 columns" + for row in reader: + questions.append( + {"question": row[0].strip(), "ground_truth": row[1].strip()} + ) + return questions + + +def validate_csv_lines(csv_file: str) -> list[dict[str, str]]: + valid_questions = [] + try: + with open(csv_file, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f) + header = next(reader, None) # Skip the header row + if header is None or len(header) != 2: + raise ValueError("CSV file is empty or has an invalid header.") + + for i, row in enumerate(reader, start=2): + if len(row) == 2 and row[0].strip() and row[1].strip(): + valid_questions.append( + {"question": row[0].strip(), "ground_truth": row[1].strip()} + ) + except Exception as e: + log_error(f"Error in validate_csv_lines: {str(e)}", traceback.format_exc()) + raise + + return valid_questions diff --git a/evaluation/script_based_evaluation/utils/logging_utils.py b/evaluation/script_based_evaluation/utils/logging_utils.py new file mode 100644 index 00000000..e6fe752c --- /dev/null +++ b/evaluation/script_based_evaluation/utils/logging_utils.py @@ -0,0 +1,9 @@ +from datetime import datetime +import os + + +def log_error(error_message: str, error_details: str): + os.makedirs("logs", exist_ok=True) + with open("logs/error_log.txt", "a") as f: + f.write(f"{datetime.now()}: {error_message}\n") + f.write(f"Details:\n{error_details}\n\n") diff --git a/evaluation/script_based_evaluation/utils/resume_utils.py b/evaluation/script_based_evaluation/utils/resume_utils.py new file mode 100644 index 00000000..96daf9fc --- /dev/null +++ b/evaluation/script_based_evaluation/utils/resume_utils.py @@ -0,0 +1,24 @@ +import json +import traceback +from typing import Any +from script_based_evaluation.utils.logging_utils import log_error + + +def initialize_resume_data() -> dict[str, Any]: + return {"retriever": "", "question": "", "iteration": 0} + + +def save_resume_data(resume_data: dict[str, Any], filename: str): + with open(filename, "w") as f: + json.dump(resume_data, f) + + +def load_resume_data(filename: str) -> dict[str, Any]: + try: + with open(filename, "r") as f: + return dict(json.load(f)) + except FileNotFoundError: + return initialize_resume_data() + except Exception as e: + log_error(f"Error in load_resume_data: {str(e)}", traceback.format_exc()) + raise
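Note: `resume_utils.py` defines `initialize_resume_data`, `save_resume_data`, and `load_resume_data`, but nothing else in this diff calls them yet. Below is a minimal sketch of how they could be wired into the evaluation loop to skip (retriever, question, iteration) combinations completed before an interrupted run; the `RESUME_FILE` path, the `run_with_resume` wrapper, and the skip logic are illustrative assumptions, not part of this change.

```python
# Illustrative sketch only -- not part of this diff.
from script_based_evaluation.utils.resume_utils import (
    load_resume_data,
    save_resume_data,
)

RESUME_FILE = "logs/resume.json"  # hypothetical location; delete it to start a fresh run


def run_with_resume(
    questions: list[dict[str, str]],
    retrievers: list[str],
    iterations: int,
) -> None:
    """Skip combinations recorded as completed by a previous, interrupted run."""
    resume = load_resume_data(RESUME_FILE)  # falls back to an empty state if the file is missing
    skipping = bool(resume["retriever"])  # only fast-forward if a position was saved

    for question in questions:
        for retriever in retrievers:
            for itr in range(1, iterations + 1):
                if skipping:
                    # Fast-forward until we pass the last recorded (and already
                    # completed) position.
                    if (
                        retriever == resume["retriever"]
                        and question["question"] == resume["question"]
                        and itr == resume["iteration"]
                    ):
                        skipping = False
                    continue

                # ... evaluate this combination here (see run_tests in main.py) ...

                save_resume_data(
                    {
                        "retriever": retriever,
                        "question": question["question"],
                        "iteration": itr,
                    },
                    RESUME_FILE,
                )
```

Since `append_result_to_csv` already writes one row per completed iteration, an alternative design is to derive the resume point from the existing `*_result.csv` instead of a separate JSON file.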