diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml index 40acd71aff15..d5ac3454b8d3 100644 --- a/python/packages/autogen-ext/pyproject.toml +++ b/python/packages/autogen-ext/pyproject.toml @@ -31,6 +31,11 @@ file-surfer = [ "autogen-agentchat==0.4.3", "markitdown>=0.0.1a2", ] + +llama-cpp = [ + "llama-cpp-python" +] + graphrag = ["graphrag>=1.0.1"] web-surfer = [ "autogen-agentchat==0.4.3", diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py new file mode 100644 index 000000000000..c0182d5c0c1d --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py @@ -0,0 +1,8 @@ +try: + from ._llama_cpp_completion_client import LlamaCppChatCompletionClient +except ImportError as e: + raise ImportError( + "Dependencies for Llama Cpp not found. " "Please install llama-cpp-python: " "pip install llama-cpp-python" + ) from e + +__all__ = ["LlamaCppChatCompletionClient"] diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py new file mode 100644 index 000000000000..7f2428bd7f32 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py @@ -0,0 +1,240 @@ +import json +import logging # added import +from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Sequence, Union + +from autogen_core import CancellationToken +from autogen_core.models import AssistantMessage, ChatCompletionClient, CreateResult, SystemMessage, UserMessage +from autogen_core.tools import Tool +from llama_cpp import Llama +from pydantic import BaseModel + + +class ComponentModel(BaseModel): + provider: str + component_type: Optional[Literal["model", "agent", "tool", "termination", "token_provider"]] = None + version: Optional[int] = None + component_version: Optional[int] = None + description: Optional[str] = None + config: Dict[str, Any] + + +class LlamaCppChatCompletionClient(ChatCompletionClient): + def __init__( + self, + repo_id: str, + filename: str, + n_gpu_layers: int = -1, + seed: int = 1337, + n_ctx: int = 1000, + verbose: bool = True, + ): + """ + Initialize the LlamaCpp client. + """ + self.logger = logging.getLogger(__name__) # initialize logger + self.logger.setLevel(logging.DEBUG if verbose else logging.INFO) # set level based on verbosity + self.llm = Llama.from_pretrained( + repo_id=repo_id, + filename=filename, + n_gpu_layers=n_gpu_layers, + seed=seed, + n_ctx=n_ctx, + verbose=verbose, + ) + self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0} + + async def create(self, messages: List[Any], tools: List[Any] = None, **kwargs) -> CreateResult: + """ + Generate a response using the model, incorporating tool metadata. + + :param messages: A list of message objects to process. + :param tools: A list of tool objects to register dynamically. + :param kwargs: Additional arguments for the model. + :return: A CreateResult object containing the model's response. 
+ """ + tools = tools or [] + + # Convert LLMMessage objects to dictionaries with 'role' and 'content' + converted_messages = [] + for msg in messages: + if isinstance(msg, SystemMessage): + converted_messages.append({"role": "system", "content": msg.content}) + elif isinstance(msg, UserMessage): + converted_messages.append({"role": "user", "content": msg.content}) + elif isinstance(msg, AssistantMessage): + converted_messages.append({"role": "assistant", "content": msg.content}) + else: + raise ValueError(f"Unsupported message type: {type(msg)}") + + # Add tool descriptions to the system message + tool_descriptions = "\n".join( + [f"Tool: {i+1}. {tool.name} - {tool.description}" for i, tool in enumerate(tools)] + ) + + few_shot_example = """ + Example tool usage: + User: Validate this request: {"patient_name": "John Doe", "patient_id": "12345", "procedure": "MRI Knee"} + Assistant: Calling tool 'validate_request' with arguments: {"patient_name": "John Doe", "patient_id": "12345", "procedure": "MRI Knee"} + """ + + system_message = ( + "You are an assistant with access to tools. " + "If a user query matches a tool, explicitly invoke it with JSON arguments. " + "Here are the tools available:\n" + f"{tool_descriptions}\n" + f"{few_shot_example}" + ) + converted_messages.insert(0, {"role": "system", "content": system_message}) + + # Debugging outputs + # print(f"DEBUG: System message: {system_message}") + # print(f"DEBUG: Converted messages: {converted_messages}") + + # Generate the model response + response = self.llm.create_chat_completion(messages=converted_messages, stream=False) + self._total_usage["prompt_tokens"] += response.get("usage", {}).get("prompt_tokens", 0) + self._total_usage["completion_tokens"] += response.get("usage", {}).get("completion_tokens", 0) + + # Parse the response + response_text = response["choices"][0]["message"]["content"] + # print(f"DEBUG: Model response: {response_text}") + + # Detect tool usage in the response + tool_call = await self._detect_and_execute_tool(response_text, tools) + if not tool_call: + self.logger.debug("DEBUG: No tool was invoked. Returning raw model response.") + else: + self.logger.debug(f"DEBUG: Tool executed successfully: {tool_call}") + + # Create a CreateResult object + create_result = CreateResult( + content=tool_call if tool_call else response_text, + usage=response.get("usage", {}), + finish_reason=response["choices"][0].get("finish_reason", "unknown"), + cached=False, + ) + return create_result + + async def _detect_and_execute_tool(self, response_text: str, tools: List[Tool]) -> Optional[str]: + """ + Detect if the model is requesting a tool and execute the tool. + + :param response_text: The raw response text from the model. + :param tools: A list of available tools. + :return: The result of the tool execution or None if no tool is called. + """ + for tool in tools: + if tool.name.lower() in response_text.lower(): # Case-insensitive matching + self.logger.debug(f"DEBUG: Detected tool '{tool.name}' in response.") + # Extract arguments (if any) from the response + func_args = self._extract_tool_arguments(response_text) + if func_args: + self.logger.debug(f"DEBUG: Extracted arguments for tool '{tool.name}': {func_args}") + else: + self.logger.debug(f"DEBUG: No arguments found for tool '{tool.name}'.") + return f"Error: No valid arguments provided for tool '{tool.name}'." 
+ + # Ensure arguments match the tool's args_type + try: + args_model = tool.args_type() + if "request" in args_model.__fields__: # Handle nested arguments + func_args = {"request": func_args} + args_instance = args_model(**func_args) + except Exception as e: + return f"Error parsing arguments for tool '{tool.name}': {e}" + + # Execute the tool + try: + result = await tool.run(args=args_instance, cancellation_token=CancellationToken()) + if isinstance(result, dict): + return json.dumps(result) + elif hasattr(result, "model_dump"): # If it's a Pydantic model + return json.dumps(result.model_dump()) + else: + return str(result) + except Exception as e: + return f"Error executing tool '{tool.name}': {e}" + + return None + + def _extract_tool_arguments(self, response_text: str) -> Dict[str, Any]: + """ + Extract tool arguments from the response text. + + :param response_text: The raw response text. + :return: A dictionary of extracted arguments. + """ + try: + args_start = response_text.find("{") + args_end = response_text.find("}") + if args_start != -1 and args_end != -1: + args_str = response_text[args_start : args_end + 1] + return json.loads(args_str) + except json.JSONDecodeError as e: + self.logger.debug(f"DEBUG: Failed to parse arguments: {e}") + return {} + + async def create_stream(self, messages: List[Any], tools: List[Any] = None, **kwargs) -> AsyncGenerator[str, None]: + """ + Generate a streaming response using the model. + + :param messages: A list of messages to process. + :param tools: A list of tool objects to register dynamically. + :param kwargs: Additional arguments for the model. + :return: An asynchronous generator yielding the response stream. + """ + tools = tools or [] + + # Convert LLMMessage objects to dictionaries with 'role' and 'content' + converted_messages = [] + for msg in messages: + if isinstance(msg, SystemMessage): + converted_messages.append({"role": "system", "content": msg.content}) + elif isinstance(msg, UserMessage): + converted_messages.append({"role": "user", "content": msg.content}) + elif isinstance(msg, AssistantMessage): + converted_messages.append({"role": "assistant", "content": msg.content}) + else: + raise ValueError(f"Unsupported message type: {type(msg)}") + + # Add tool descriptions to the system message + tool_descriptions = "\n".join([f"Tool: {tool.name} - {tool.description}" for tool in tools]) + if tool_descriptions: + converted_messages.insert( + 0, {"role": "system", "content": f"The following tools are available:\n{tool_descriptions}"} + ) + + # Convert messages into a plain string prompt + prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in converted_messages) + # Call the model with streaming enabled + response_generator = self.llm(prompt=prompt, stream=True) + + for token in response_generator: + yield token["choices"][0]["text"] + + # Implement abstract methods + def actual_usage(self) -> Dict[str, int]: + return self._total_usage + + @property + def capabilities(self) -> Dict[str, bool]: + return {"chat": True, "stream": True} + + def count_tokens(self, messages: Sequence[Dict[str, Any]], **kwargs) -> int: + return sum(len(msg["content"].split()) for msg in messages) + + @property + def model_info(self) -> Dict[str, Any]: + return { + "name": "llama-cpp", + "capabilities": {"chat": True, "stream": True}, + "context_window": self.llm.n_ctx, + "function_calling": True, + } + + def remaining_tokens(self, messages: Sequence[Dict[str, Any]], **kwargs) -> int: + used_tokens = self.count_tokens(messages) + return 
max(self.llm.n_ctx - used_tokens, 0) + + def total_usage(self) -> Dict[str, int]: + return self._total_usage diff --git a/python/uv.lock b/python/uv.lock index d01ba4f8d7fa..54fccef3d1c9 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -588,6 +588,9 @@ jupyter-executor = [ langchain = [ { name = "langchain-core" }, ] +llama-cpp = [ + { name = "llama-cpp-python" }, +] magentic-one = [ { name = "autogen-agentchat" }, { name = "markitdown" }, @@ -676,6 +679,7 @@ requires-dist = [ { name = "grpcio", marker = "extra == 'grpc'", specifier = "~=1.62.0" }, { name = "ipykernel", marker = "extra == 'jupyter-executor'", specifier = ">=6.29.5" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = "~=0.3.3" }, + { name = "llama-cpp-python", marker = "extra == 'llama-cpp'" }, { name = "markitdown", marker = "extra == 'file-surfer'", specifier = ">=0.0.1a2" }, { name = "markitdown", marker = "extra == 'magentic-one'", specifier = ">=0.0.1a2" }, { name = "markitdown", marker = "extra == 'web-surfer'", specifier = ">=0.0.1a2" }, @@ -3200,6 +3204,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/0f/af106de1780cf526c96de1ba279edcb55a0376a4484a7dea206f9f038cc4/llama_cloud-0.1.8-py3-none-any.whl", hash = "sha256:1a0c4cf212a04f2375f1d0791ca4e5f196e0fb0567c4ec96cd9dbcad773de60a", size = 247083 }, ] +[[package]] +name = "llama-cpp-python" +version = "0.3.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "diskcache" }, + { name = "jinja2" }, + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/38/7a47b1fb1d83eaddd86ca8ddaf20f141cbc019faf7b425283d8e5ef710e5/llama_cpp_python-0.3.7.tar.gz", hash = "sha256:0566a0dcc0f38005c4093309a87f67c2452449522e3e17e15cd735a62957894c", size = 66715891 } + [[package]] name = "llama-index" version = "0.12.11" @@ -4233,7 +4249,6 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -4242,7 +4257,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -4251,7 +4265,6 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -4260,7 +4273,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -4283,7 +4295,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -4292,7 +4303,6 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -4306,7 +4316,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 
}, ] @@ -4318,7 +4327,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -4335,7 +4343,6 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -4344,7 +4351,6 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ]
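A minimal usage sketch for the new client added in this diff, under stated assumptions: the "llama-cpp" optional extra is installed (e.g. pip install "autogen-ext[llama-cpp]"), and the repo_id/filename values below are illustrative placeholders for a GGUF model on Hugging Face, not anything referenced by this PR. Only the constructor and create() signature from _llama_cpp_completion_client.py are relied on.

# Hedged usage sketch for LlamaCppChatCompletionClient (not part of the diff above).
# Assumptions: the "llama-cpp" extra is installed and the repo_id/filename are
# placeholders the reader replaces with a real GGUF model repository and file.
import asyncio

from autogen_core.models import SystemMessage, UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main() -> None:
    # Constructor arguments mirror LlamaCppChatCompletionClient.__init__ in this PR.
    client = LlamaCppChatCompletionClient(
        repo_id="your-org/your-model-GGUF",   # placeholder Hugging Face repo
        filename="your-model-Q4_K_M.gguf",    # placeholder GGUF file name
        n_gpu_layers=-1,                      # offload all layers if a GPU is available
        n_ctx=1000,
        verbose=False,
    )

    # create() accepts SystemMessage/UserMessage/AssistantMessage objects and
    # returns a CreateResult whose content is the model's reply (or, if a tool
    # was detected and executed, the tool's result).
    result = await client.create(
        [
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content="Briefly explain what llama.cpp is.", source="user"),
        ]
    )
    print(result.content)


asyncio.run(main())

Passing a list of Tool instances via the tools argument exercises the prompt-based tool detection path shown in create(); with no tools supplied, the raw model response is returned unchanged.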