diff --git a/src/openlayer/lib/constants.py b/src/openlayer/lib/constants.py
deleted file mode 100644
index 3566ecae..00000000
--- a/src/openlayer/lib/constants.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Module for storing constants used throughout the OpenLayer SDK.
-"""
-
-# --------------------------- LLM usage costs table -------------------------- #
-# Last update: 2024-02-05
-OPENAI_COST_PER_TOKEN = {
-    "babbage-002": {
-        "input": 0.0004e-3,
-        "output": 0.0004e-3,
-    },
-    "davinci-002": {
-        "input": 0.002e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-3.5-turbo": {
-        "input": 0.0005e-3,
-        "output": 0.0015e-3,
-    },
-    "gpt-3.5-turbo-0125": {
-        "input": 0.0005e-3,
-        "output": 0.0015e-3,
-    },
-    "gpt-3.5-turbo-0301": {
-        "input": 0.0015e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-3.5-turbo-0613": {
-        "input": 0.0015e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-3.5-turbo-1106": {
-        "input": 0.001e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-3.5-turbo-16k-0613": {
-        "input": 0.003e-3,
-        "output": 0.004e-3,
-    },
-    "gpt-3.5-turbo-instruct": {
-        "input": 0.0015e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-4": {
-        "input": 0.03e-3,
-        "output": 0.06e-3,
-    },
-    "gpt-4-turbo-preview": {
-        "input": 0.01e-3,
-        "output": 0.03e-3,
-    },
-    "gpt-4-0125-preview": {
-        "input": 0.01e-3,
-        "output": 0.03e-3,
-    },
-    "gpt-4-1106-preview": {
-        "input": 0.01e-3,
-        "output": 0.03e-3,
-    },
-    "gpt-4-0314": {
-        "input": 0.03e-3,
-        "output": 0.06e-3,
-    },
-    "gpt-4-1106-vision-preview": {
-        "input": 0.01e-3,
-        "output": 0.03e-3,
-    },
-    "gpt-4-32k": {
-        "input": 0.06e-3,
-        "output": 0.12e-3,
-    },
-    "gpt-4-32k-0314": {
-        "input": 0.06e-3,
-        "output": 0.12e-3,
-    },
-}
-# Last update: 2024-03-26
-AZURE_OPENAI_COST_PER_TOKEN = {
-    "babbage-002": {
-        "input": 0.0004e-3,
-        "output": 0.0004e-3,
-    },
-    "davinci-002": {
-        "input": 0.002e-3,
-        "output": 0.002e-3,
-    },
-    "gpt-35-turbo": {"input": 0.0005e-3, "output": 0.0015e-3},
-    "gpt-35-turbo-0125": {"input": 0.0005e-3, "output": 0.0015e-3},
-    "gpt-35-turbo-instruct": {"input": 0.0015e-3, "output": 0.002e-3},
-    "gpt-4-turbo": {"input": 0.01e-3, "output": 0.03e-3},
-    "gpt-4-turbo-vision": {"input": 0.01e-3, "output": 0.03e-3},
-    "gpt-4-8k": {"input": 0.03e-3, "output": 0.06e-3},
-    "gpt-4-32k": {"input": 0.06e-3, "output": 0.12e-3},
-}
diff --git a/src/openlayer/lib/integrations/anthropic_tracer.py b/src/openlayer/lib/integrations/anthropic_tracer.py
index d1d0f23c..241e3382 100644
--- a/src/openlayer/lib/integrations/anthropic_tracer.py
+++ b/src/openlayer/lib/integrations/anthropic_tracer.py
@@ -23,7 +23,6 @@ def trace_anthropic(
     - end_time: The time when the completion was received.
     - latency: The time it took to generate the completion.
     - tokens: The total number of tokens used to generate the completion.
-    - cost: The estimated cost of the completion.
     - prompt_tokens: The number of tokens in the prompt.
     - completion_tokens: The number of tokens in the completion.
     - model: The model used to generate the completion.
@@ -152,15 +151,12 @@ def stream_chunks(
                 collected_function_call["inputs"] = json.loads(collected_function_call["inputs"])
                 output_data = collected_function_call
 
-            cost = 0
-
             trace_args = create_trace_args(
                 end_time=end_time,
                 inputs={"prompt": kwargs["messages"]},
                 output=output_data,
                 latency=latency,
                 tokens=num_of_completion_tokens,
-                cost=cost,
                 prompt_tokens=num_of_prompt_tokens,
                 completion_tokens=num_of_completion_tokens,
                 model=kwargs.get("model"),
@@ -206,14 +202,12 @@ def handle_non_streaming_create(
     # Try to add step to the trace
     try:
         output_data = parse_non_streaming_output_data(response)
-        cost = 0
         trace_args = create_trace_args(
             end_time=end_time,
             inputs={"prompt": kwargs["messages"]},
             output=output_data,
             latency=(end_time - start_time) * 1000,
             tokens=response.usage.input_tokens + response.usage.output_tokens,
-            cost=cost,
             prompt_tokens=response.usage.input_tokens,
             completion_tokens=response.usage.output_tokens,
             model=response.model,
@@ -275,7 +269,6 @@ def create_trace_args(
     output: str,
     latency: float,
     tokens: int,
-    cost: float,
     prompt_tokens: int,
     completion_tokens: int,
     model: str,
@@ -291,7 +284,6 @@ def create_trace_args(
         "output": output,
         "latency": latency,
         "tokens": tokens,
-        "cost": cost,
         "prompt_tokens": prompt_tokens,
         "completion_tokens": completion_tokens,
         "model": model,
diff --git a/src/openlayer/lib/integrations/langchain_callback.py b/src/openlayer/lib/integrations/langchain_callback.py
index 6b9b393b..41b4a6b4 100644
--- a/src/openlayer/lib/integrations/langchain_callback.py
+++ b/src/openlayer/lib/integrations/langchain_callback.py
@@ -7,7 +7,6 @@
 from langchain import schema as langchain_schema
 from langchain.callbacks.base import BaseCallbackHandler
 
-from .. import constants
 from ..tracing import tracer
 
 LANGCHAIN_TO_OPENLAYER_PROVIDER_MAP = {"openai-chat": "OpenAI"}
@@ -27,7 +26,6 @@ def __init__(self, **kwargs: Any) -> None:
         self.provider: str = None
         self.model: Optional[str] = None
         self.model_parameters: Dict[str, Any] = None
-        self.cost: Optional[float] = None
         self.prompt_tokens: int = None
         self.completion_tokens: int = None
         self.total_tokens: int = None
@@ -87,10 +85,6 @@ def on_llm_end(self, response: langchain_schema.LLMResult, **kwargs: Any) -> Any
         if response.llm_output and "token_usage" in response.llm_output:
             self.prompt_tokens = response.llm_output["token_usage"].get("prompt_tokens", 0)
             self.completion_tokens = response.llm_output["token_usage"].get("completion_tokens", 0)
-            self.cost = self._get_cost_estimate(
-                num_input_tokens=self.prompt_tokens,
-                num_output_tokens=self.completion_tokens,
-            )
             self.total_tokens = response.llm_output["token_usage"].get("total_tokens", 0)
 
         for generations in response.generations:
@@ -99,13 +93,6 @@ def on_llm_end(self, response: langchain_schema.LLMResult, **kwargs: Any) -> Any
 
         self._add_to_trace()
 
-    def _get_cost_estimate(self, num_input_tokens: int, num_output_tokens: int) -> float:
-        """Returns the cost estimate for a given model and number of tokens."""
-        if self.model not in constants.OPENAI_COST_PER_TOKEN:
-            return None
-        cost_per_token = constants.OPENAI_COST_PER_TOKEN[self.model]
-        return cost_per_token["input"] * num_input_tokens + cost_per_token["output"] * num_output_tokens
-
     def _add_to_trace(self) -> None:
         """Adds to the trace."""
         name = PROVIDER_TO_STEP_NAME.get(self.provider, "Chat Completion Model")
@@ -114,7 +101,6 @@ def _add_to_trace(self) -> None:
             provider=self.provider,
             inputs={"prompt": self.prompt},
             output=self.output,
-            cost=self.cost,
             tokens=self.total_tokens,
             latency=self.latency,
             start_time=self.start_time,
diff --git a/src/openlayer/lib/integrations/openai_tracer.py b/src/openlayer/lib/integrations/openai_tracer.py
index fbc89317..064c35a9 100644
--- a/src/openlayer/lib/integrations/openai_tracer.py
+++ b/src/openlayer/lib/integrations/openai_tracer.py
@@ -8,7 +8,6 @@
 
 import openai
 
-from .. import constants
 from ..tracing import tracer
 
 logger = logging.getLogger(__name__)
@@ -24,7 +23,6 @@ def trace_openai(
     - end_time: The time when the completion was received.
     - latency: The time it took to generate the completion.
     - tokens: The total number of tokens used to generate the completion.
-    - cost: The estimated cost of the completion.
     - prompt_tokens: The number of tokens in the prompt.
     - completion_tokens: The number of tokens in the completion.
    - model: The model used to generate the completion.
@@ -161,12 +159,6 @@ def stream_chunks(
             else:
                 collected_function_call["arguments"] = json.loads(collected_function_call["arguments"])
                 output_data = collected_function_call
-            completion_cost = estimate_cost(
-                model=kwargs.get("model"),
-                prompt_tokens=0,
-                completion_tokens=(num_of_completion_tokens if num_of_completion_tokens else 0),
-                is_azure_openai=is_azure_openai,
-            )
 
             trace_args = create_trace_args(
                 end_time=end_time,
@@ -174,7 +166,6 @@ def stream_chunks(
                 output=output_data,
                 latency=latency,
                 tokens=num_of_completion_tokens,
-                cost=completion_cost,
                 prompt_tokens=0,
                 completion_tokens=num_of_completion_tokens,
                 model=kwargs.get("model"),
@@ -196,21 +187,6 @@ def stream_chunks(
     )
 
 
-def estimate_cost(
-    prompt_tokens: int,
-    completion_tokens: int,
-    model: str,
-    is_azure_openai: bool = False,
-) -> float:
-    """Returns the cost estimate for a given OpenAI model and number of tokens."""
-    if is_azure_openai and model in constants.AZURE_OPENAI_COST_PER_TOKEN:
-        cost_per_token = constants.AZURE_OPENAI_COST_PER_TOKEN[model]
-    elif model in constants.OPENAI_COST_PER_TOKEN:
-        cost_per_token = constants.OPENAI_COST_PER_TOKEN[model]
-        return cost_per_token["input"] * prompt_tokens + cost_per_token["output"] * completion_tokens
-    return None
-
-
 def get_model_parameters(kwargs: Dict[str, Any]) -> Dict[str, Any]:
     """Gets the model parameters from the kwargs."""
     return {
@@ -234,7 +210,6 @@ def create_trace_args(
    output: str,
     latency: float,
     tokens: int,
-    cost: float,
     prompt_tokens: int,
     completion_tokens: int,
     model: str,
@@ -250,7 +225,6 @@ def create_trace_args(
         "output": output,
         "latency": latency,
         "tokens": tokens,
-        "cost": cost,
         "prompt_tokens": prompt_tokens,
         "completion_tokens": completion_tokens,
         "model": model,
@@ -300,19 +274,12 @@ def handle_non_streaming_create(
     # Try to add step to the trace
     try:
         output_data = parse_non_streaming_output_data(response)
-        cost = estimate_cost(
-            model=response.model,
-            prompt_tokens=response.usage.prompt_tokens,
-            completion_tokens=response.usage.completion_tokens,
-            is_azure_openai=is_azure_openai,
-        )
         trace_args = create_trace_args(
             end_time=end_time,
             inputs={"prompt": kwargs["messages"]},
             output=output_data,
             latency=(end_time - start_time) * 1000,
             tokens=response.usage.total_tokens,
-            cost=cost,
             prompt_tokens=response.usage.prompt_tokens,
             completion_tokens=response.usage.completion_tokens,
             model=response.model,
@@ -373,7 +340,7 @@ def trace_openai_assistant_thread_run(client: openai.OpenAI, run: "openai.types.
     """Trace a run from an OpenAI assistant.
 
     Once the run is completed, the thread data is published to Openlayer,
-    along with the latency, cost, and number of tokens used."""
+    along with the latency, and number of tokens used."""
     _type_check_run(run)
 
     # Do nothing if the run is not completed
@@ -420,11 +387,6 @@ def _extract_run_vars(run: "openai.types.beta.threads.run.Run") -> Dict[str, any
         "completion_tokens": run.usage.completion_tokens,
         "tokens": run.usage.total_tokens,
         "model": run.model,
-        "cost": estimate_cost(
-            model=run.model,
-            prompt_tokens=run.usage.prompt_tokens,
-            completion_tokens=run.usage.completion_tokens,
-        ),
     }
 
 
diff --git a/src/openlayer/lib/tracing/tracer.py b/src/openlayer/lib/tracing/tracer.py
index 739a89f3..199f0667 100644
--- a/src/openlayer/lib/tracing/tracer.py
+++ b/src/openlayer/lib/tracing/tracer.py
@@ -306,7 +306,7 @@ def post_process_trace(
     else:
         input_variable_names = []
 
-    processed_steps = bubble_up_costs_and_tokens(trace_obj.to_dict())
+    processed_steps = trace_obj.to_dict()
 
     trace_data = {
         "inferenceTimestamp": root_step.start_time,
@@ -322,34 +322,3 @@ def post_process_trace(
     trace_data.update(input_variables)
 
     return trace_data, input_variable_names
-
-
-def bubble_up_costs_and_tokens(trace_dict: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Adds the cost and number of tokens of nested steps to their parent steps."""
-
-    def add_step_costs_and_tokens(step: Dict[str, Any]) -> Tuple[float, int]:
-        step_cost = step_tokens = 0
-
-        if "cost" in step and step["cost"] is not None:
-            step_cost += step["cost"]
-        if "tokens" in step and step["tokens"] is not None:
-            step_tokens += step["tokens"]
-
-        # Recursively add costs and tokens from nested steps
-        for nested_step in step.get("steps", []):
-            nested_cost, nested_tokens = add_step_costs_and_tokens(nested_step)
-            step_cost += nested_cost
-            step_tokens += nested_tokens
-
-        if "steps" in step:
-            if step_cost > 0 and "cost" not in step:
-                step["cost"] = step_cost
-            if step_tokens > 0 and "tokens" not in step:
-                step["tokens"] = step_tokens
-
-        return step_cost, step_tokens
-
-    for root_step_dict in trace_dict:
-        add_step_costs_and_tokens(root_step_dict)
-
-    return trace_dict
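With estimate_cost, _get_cost_estimate, and the per-token price tables removed by this patch, traced steps no longer carry a cost field, and callers who still want an estimate must compute it themselves. Below is a minimal sketch of how a caller could reproduce the removed arithmetic, assuming a user-maintained price table; MY_COST_PER_TOKEN and my_estimate_cost are hypothetical names, and the rates shown are just example values copied from the deleted constants.py, not anything the SDK ships.

from typing import Dict, Optional

# Hypothetical, user-maintained price table in USD per token (example rates only).
MY_COST_PER_TOKEN: Dict[str, Dict[str, float]] = {
    "gpt-3.5-turbo": {"input": 0.0005e-3, "output": 0.0015e-3},
    "gpt-4": {"input": 0.03e-3, "output": 0.06e-3},
}


def my_estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> Optional[float]:
    """Same formula the removed helpers used: per-direction rate times token count, summed."""
    rates = MY_COST_PER_TOKEN.get(model)
    if rates is None:
        return None
    return rates["input"] * prompt_tokens + rates["output"] * completion_tokens

The prompt and completion token counts are still attached to each traced step (prompt_tokens, completion_tokens, tokens), so this kind of helper can be applied to the exported trace data after the fact.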