diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml index 40acd71aff15..d5ac3454b8d3 100644 --- a/python/packages/autogen-ext/pyproject.toml +++ b/python/packages/autogen-ext/pyproject.toml @@ -31,6 +31,11 @@ file-surfer = [ "autogen-agentchat==0.4.3", "markitdown>=0.0.1a2", ] + +llama-cpp = [ + "llama-cpp-python" +] + graphrag = ["graphrag>=1.0.1"] web-surfer = [ "autogen-agentchat==0.4.3", diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py new file mode 100644 index 000000000000..c0182d5c0c1d --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/__init__.py @@ -0,0 +1,8 @@ +try: + from ._llama_cpp_completion_client import LlamaCppChatCompletionClient +except ImportError as e: + raise ImportError( + "Dependencies for Llama Cpp not found. " "Please install llama-cpp-python: " "pip install llama-cpp-python" + ) from e + +__all__ = ["LlamaCppChatCompletionClient"] diff --git a/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py new file mode 100644 index 000000000000..7f2428bd7f32 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/models/llama_cpp/_llama_cpp_completion_client.py @@ -0,0 +1,240 @@ +import json +import logging # added import +from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Sequence, Union + +from autogen_core import CancellationToken +from autogen_core.models import AssistantMessage, ChatCompletionClient, CreateResult, SystemMessage, UserMessage +from autogen_core.tools import Tool +from llama_cpp import Llama +from pydantic import BaseModel + + +class ComponentModel(BaseModel): + provider: str + component_type: Optional[Literal["model", "agent", "tool", "termination", "token_provider"]] = None + version: Optional[int] = None + component_version: Optional[int] = None + description: Optional[str] = None + config: Dict[str, Any] + + +class LlamaCppChatCompletionClient(ChatCompletionClient): + def __init__( + self, + repo_id: str, + filename: str, + n_gpu_layers: int = -1, + seed: int = 1337, + n_ctx: int = 1000, + verbose: bool = True, + ): + """ + Initialize the LlamaCpp client. + """ + self.logger = logging.getLogger(__name__) # initialize logger + self.logger.setLevel(logging.DEBUG if verbose else logging.INFO) # set level based on verbosity + self.llm = Llama.from_pretrained( + repo_id=repo_id, + filename=filename, + n_gpu_layers=n_gpu_layers, + seed=seed, + n_ctx=n_ctx, + verbose=verbose, + ) + self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0} + + async def create(self, messages: List[Any], tools: List[Any] = None, **kwargs) -> CreateResult: + """ + Generate a response using the model, incorporating tool metadata. + + :param messages: A list of message objects to process. + :param tools: A list of tool objects to register dynamically. + :param kwargs: Additional arguments for the model. + :return: A CreateResult object containing the model's response. 
+ """ + tools = tools or [] + + # Convert LLMMessage objects to dictionaries with 'role' and 'content' + converted_messages = [] + for msg in messages: + if isinstance(msg, SystemMessage): + converted_messages.append({"role": "system", "content": msg.content}) + elif isinstance(msg, UserMessage): + converted_messages.append({"role": "user", "content": msg.content}) + elif isinstance(msg, AssistantMessage): + converted_messages.append({"role": "assistant", "content": msg.content}) + else: + raise ValueError(f"Unsupported message type: {type(msg)}") + + # Add tool descriptions to the system message + tool_descriptions = "\n".join( + [f"Tool: {i+1}. {tool.name} - {tool.description}" for i, tool in enumerate(tools)] + ) + + few_shot_example = """ + Example tool usage: + User: Validate this request: {"patient_name": "John Doe", "patient_id": "12345", "procedure": "MRI Knee"} + Assistant: Calling tool 'validate_request' with arguments: {"patient_name": "John Doe", "patient_id": "12345", "procedure": "MRI Knee"} + """ + + system_message = ( + "You are an assistant with access to tools. " + "If a user query matches a tool, explicitly invoke it with JSON arguments. " + "Here are the tools available:\n" + f"{tool_descriptions}\n" + f"{few_shot_example}" + ) + converted_messages.insert(0, {"role": "system", "content": system_message}) + + # Debugging outputs + # print(f"DEBUG: System message: {system_message}") + # print(f"DEBUG: Converted messages: {converted_messages}") + + # Generate the model response + response = self.llm.create_chat_completion(messages=converted_messages, stream=False) + self._total_usage["prompt_tokens"] += response.get("usage", {}).get("prompt_tokens", 0) + self._total_usage["completion_tokens"] += response.get("usage", {}).get("completion_tokens", 0) + + # Parse the response + response_text = response["choices"][0]["message"]["content"] + # print(f"DEBUG: Model response: {response_text}") + + # Detect tool usage in the response + tool_call = await self._detect_and_execute_tool(response_text, tools) + if not tool_call: + self.logger.debug("DEBUG: No tool was invoked. Returning raw model response.") + else: + self.logger.debug(f"DEBUG: Tool executed successfully: {tool_call}") + + # Create a CreateResult object + create_result = CreateResult( + content=tool_call if tool_call else response_text, + usage=response.get("usage", {}), + finish_reason=response["choices"][0].get("finish_reason", "unknown"), + cached=False, + ) + return create_result + + async def _detect_and_execute_tool(self, response_text: str, tools: List[Tool]) -> Optional[str]: + """ + Detect if the model is requesting a tool and execute the tool. + + :param response_text: The raw response text from the model. + :param tools: A list of available tools. + :return: The result of the tool execution or None if no tool is called. + """ + for tool in tools: + if tool.name.lower() in response_text.lower(): # Case-insensitive matching + self.logger.debug(f"DEBUG: Detected tool '{tool.name}' in response.") + # Extract arguments (if any) from the response + func_args = self._extract_tool_arguments(response_text) + if func_args: + self.logger.debug(f"DEBUG: Extracted arguments for tool '{tool.name}': {func_args}") + else: + self.logger.debug(f"DEBUG: No arguments found for tool '{tool.name}'.") + return f"Error: No valid arguments provided for tool '{tool.name}'." 
+ + # Ensure arguments match the tool's args_type + try: + args_model = tool.args_type() + if "request" in args_model.__fields__: # Handle nested arguments + func_args = {"request": func_args} + args_instance = args_model(**func_args) + except Exception as e: + return f"Error parsing arguments for tool '{tool.name}': {e}" + + # Execute the tool + try: + result = await tool.run(args=args_instance, cancellation_token=CancellationToken()) + if isinstance(result, dict): + return json.dumps(result) + elif hasattr(result, "model_dump"): # If it's a Pydantic model + return json.dumps(result.model_dump()) + else: + return str(result) + except Exception as e: + return f"Error executing tool '{tool.name}': {e}" + + return None + + def _extract_tool_arguments(self, response_text: str) -> Dict[str, Any]: + """ + Extract tool arguments from the response text. + + :param response_text: The raw response text. + :return: A dictionary of extracted arguments. + """ + try: + args_start = response_text.find("{") + args_end = response_text.find("}") + if args_start != -1 and args_end != -1: + args_str = response_text[args_start : args_end + 1] + return json.loads(args_str) + except json.JSONDecodeError as e: + self.logger.debug(f"DEBUG: Failed to parse arguments: {e}") + return {} + + async def create_stream(self, messages: List[Any], tools: List[Any] = None, **kwargs) -> AsyncGenerator[str, None]: + """ + Generate a streaming response using the model. + + :param messages: A list of messages to process. + :param tools: A list of tool objects to register dynamically. + :param kwargs: Additional arguments for the model. + :return: An asynchronous generator yielding the response stream. + """ + tools = tools or [] + + # Convert LLMMessage objects to dictionaries with 'role' and 'content' + converted_messages = [] + for msg in messages: + if isinstance(msg, SystemMessage): + converted_messages.append({"role": "system", "content": msg.content}) + elif isinstance(msg, UserMessage): + converted_messages.append({"role": "user", "content": msg.content}) + elif isinstance(msg, AssistantMessage): + converted_messages.append({"role": "assistant", "content": msg.content}) + else: + raise ValueError(f"Unsupported message type: {type(msg)}") + + # Add tool descriptions to the system message + tool_descriptions = "\n".join([f"Tool: {tool.name} - {tool.description}" for tool in tools]) + if tool_descriptions: + converted_messages.insert( + 0, {"role": "system", "content": f"The following tools are available:\n{tool_descriptions}"} + ) + + # Convert messages into a plain string prompt + prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in converted_messages) + # Call the model with streaming enabled + response_generator = self.llm(prompt=prompt, stream=True) + + for token in response_generator: + yield token["choices"][0]["text"] + + # Implement abstract methods + def actual_usage(self) -> Dict[str, int]: + return self._total_usage + + @property + def capabilities(self) -> Dict[str, bool]: + return {"chat": True, "stream": True} + + def count_tokens(self, messages: Sequence[Dict[str, Any]], **kwargs) -> int: + return sum(len(msg["content"].split()) for msg in messages) + + @property + def model_info(self) -> Dict[str, Any]: + return { + "name": "llama-cpp", + "capabilities": {"chat": True, "stream": True}, + "context_window": self.llm.n_ctx, + "function_calling": True, + } + + def remaining_tokens(self, messages: Sequence[Dict[str, Any]], **kwargs) -> int: + used_tokens = self.count_tokens(messages) + return 
max(self.llm.n_ctx - used_tokens, 0) + + def total_usage(self) -> Dict[str, int]: + return self._total_usage diff --git a/python/uv.lock b/python/uv.lock index d01ba4f8d7fa..54fccef3d1c9 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -588,6 +588,9 @@ jupyter-executor = [ langchain = [ { name = "langchain-core" }, ] +llama-cpp = [ + { name = "llama-cpp-python" }, +] magentic-one = [ { name = "autogen-agentchat" }, { name = "markitdown" }, @@ -676,6 +679,7 @@ requires-dist = [ { name = "grpcio", marker = "extra == 'grpc'", specifier = "~=1.62.0" }, { name = "ipykernel", marker = "extra == 'jupyter-executor'", specifier = ">=6.29.5" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = "~=0.3.3" }, + { name = "llama-cpp-python", marker = "extra == 'llama-cpp'" }, { name = "markitdown", marker = "extra == 'file-surfer'", specifier = ">=0.0.1a2" }, { name = "markitdown", marker = "extra == 'magentic-one'", specifier = ">=0.0.1a2" }, { name = "markitdown", marker = "extra == 'web-surfer'", specifier = ">=0.0.1a2" }, @@ -3200,6 +3204,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/0f/af106de1780cf526c96de1ba279edcb55a0376a4484a7dea206f9f038cc4/llama_cloud-0.1.8-py3-none-any.whl", hash = "sha256:1a0c4cf212a04f2375f1d0791ca4e5f196e0fb0567c4ec96cd9dbcad773de60a", size = 247083 }, ] +[[package]] +name = "llama-cpp-python" +version = "0.3.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "diskcache" }, + { name = "jinja2" }, + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/38/7a47b1fb1d83eaddd86ca8ddaf20f141cbc019faf7b425283d8e5ef710e5/llama_cpp_python-0.3.7.tar.gz", hash = "sha256:0566a0dcc0f38005c4093309a87f67c2452449522e3e17e15cd735a62957894c", size = 66715891 } + [[package]] name = "llama-index" version = "0.12.11" @@ -4233,7 +4249,6 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -4242,7 +4257,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -4251,7 +4265,6 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -4260,7 +4273,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -4283,7 +4295,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -4292,7 +4303,6 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -4306,7 +4316,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 
}, ] @@ -4318,7 +4327,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -4335,7 +4343,6 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -4344,7 +4351,6 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ]
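A minimal usage sketch for the new client added in this diff, under stated assumptions: the "llama-cpp" optional extra is installed (e.g. pip install "autogen-ext[llama-cpp]"), and the repo_id/filename values below are illustrative placeholders for a GGUF model on Hugging Face, not anything referenced by this PR. Only the constructor and create() signature from _llama_cpp_completion_client.py are relied on.

# Hedged usage sketch for LlamaCppChatCompletionClient (not part of the diff above).
# Assumptions: the "llama-cpp" extra is installed and the repo_id/filename are
# placeholders the reader replaces with a real GGUF model repository and file.
import asyncio

from autogen_core.models import SystemMessage, UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main() -> None:
    # Constructor arguments mirror LlamaCppChatCompletionClient.__init__ in this PR.
    client = LlamaCppChatCompletionClient(
        repo_id="your-org/your-model-GGUF",   # placeholder Hugging Face repo
        filename="your-model-Q4_K_M.gguf",    # placeholder GGUF file name
        n_gpu_layers=-1,                      # offload all layers if a GPU is available
        n_ctx=1000,
        verbose=False,
    )

    # create() accepts SystemMessage/UserMessage/AssistantMessage objects and
    # returns a CreateResult whose content is the model's reply (or, if a tool
    # was detected and executed, the tool's result).
    result = await client.create(
        [
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content="Briefly explain what llama.cpp is.", source="user"),
        ]
    )
    print(result.content)


asyncio.run(main())

Passing a list of Tool instances via the tools argument exercises the prompt-based tool detection path shown in create(); with no tools supplied, the raw model response is returned unchanged.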