diff --git a/llama_hub/llama_packs/library.json b/llama_hub/llama_packs/library.json
index b571ed3f67..cd8ba8bcaa 100644
--- a/llama_hub/llama_packs/library.json
+++ b/llama_hub/llama_packs/library.json
@@ -278,6 +278,11 @@
     "author": "jerryjliu",
     "keywords": ["vanna", "sql", "ai", "text-to-sql"]
   },
+  "SelfRAGPack": {
+    "id": "llama_packs/self_rag",
+    "author": "mmaatouk",
+    "keywords": ["self-RAG", "llm", "smart-retriever"]
+  },
   "InferRetrieveRerankPack": {
     "id": "llama_packs/research/infer_retrieve_rerank",
     "author": "jerryjliu",
diff --git a/llama_hub/llama_packs/self_rag/README.md b/llama_hub/llama_packs/self_rag/README.md
new file mode 100644
index 0000000000..c5fe9cf995
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/README.md
@@ -0,0 +1,63 @@
+# Simple self-RAG short-form pack
+
+This LlamaPack implements, in short form, the [self-RAG paper by Asai et al.](https://arxiv.org/pdf/2310.11511.pdf).
+
+The paper presents a novel framework called Self-Reflective Retrieval-Augmented Generation (SELF-RAG), which aims to enhance the quality and factuality of large language models (LLMs) by combining retrieval and self-reflection mechanisms.
+
+The implementation is adapted from the authors' [implementation](https://github.com/AkariAsai/self-rag).
+A full notebook guide can be found [here](https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb).
+
+## CLI Usage
+
+You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:
+
+```bash
+llamaindex-cli download-llamapack SelfRAGPack --download-dir ./self_rag_pack
+```
+
+You can then inspect the files at `./self_rag_pack` and use them as a template for your own project!
+
+## Code Usage
+
+We will show you how to import the query engine from these files!
+The implementation uses llama-cpp; to download the relevant model (be sure to replace `DIR_PATH` with your target directory):
+
+```bash
+pip3 install -q huggingface-hub
+huggingface-cli download m4r1/selfrag_llama2_7b-GGUF selfrag_llama2_7b.q4_k_m.gguf --local-dir "DIR_PATH" --local-dir-use-symlinks False
+```
+
+```python
+from llama_index.llama_pack import download_llama_pack
+
+# download and install dependencies
+SelfRAGPack = download_llama_pack(
+    "SelfRAGPack", "./self_rag_pack"
+)
+```
+
+From here, you can use the pack. You can import the relevant modules from the download folder (in the example below we assume it's a relative import, or that the directory has been added to your system path).
+
+```python
+from self_rag_pack.base import SelfRAGQueryEngine
+
+query_engine = SelfRAGQueryEngine(model_path=model_path, retriever=retriever, verbose=True)
+
+response = query_engine.query("Who won best Director in the 1972 Academy Awards?")
+```
+
+You can also use/initialize the pack directly.
+
+```python
+from self_rag_pack.base import SelfRAGPack
+
+agent_pack = SelfRAGPack(model_path=model_path, retriever=retriever, verbose=True)
+```
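+
+Both snippets above assume you already have a `model_path` pointing at the downloaded GGUF file and a `retriever` to hand the pack. As a minimal sketch (any `BaseRetriever` works; this mirrors the simple in-memory setup from the notebook guide, and `"DIR_PATH"` is a placeholder for the download directory you chose above):
+
+```python
+from pathlib import Path
+
+from llama_index import Document, VectorStoreIndex
+from llama_index.retrievers import VectorIndexRetriever
+
+# Path to the GGUF file downloaded with huggingface-cli above ("DIR_PATH" is a placeholder).
+model_path = str(Path("DIR_PATH") / "selfrag_llama2_7b.q4_k_m.gguf")
+
+# A simple vector retriever over a few in-memory documents.
+documents = [Document(text="Some text the engine can retrieve against.")]
+index = VectorStoreIndex.from_documents(documents)
+retriever = VectorIndexRetriever(index=index, similarity_top_k=10)
+```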
+
+The `run()` function is a light wrapper around `query_engine.query()`.
+
+```python
+response = agent_pack.run("Who won best Director in the 1972 Academy Awards?")
+```
diff --git a/llama_hub/llama_packs/self_rag/__init__.py b/llama_hub/llama_packs/self_rag/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/llama_hub/llama_packs/self_rag/base.py b/llama_hub/llama_packs/self_rag/base.py
new file mode 100644
index 0000000000..afa160b0d2
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/base.py
@@ -0,0 +1,310 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List
+import numpy as np
+
+from llama_index import Response
+from llama_index.llama_pack.base import BaseLlamaPack
+from llama_index.bridge.pydantic import Field
+from llama_index.query_engine import CustomQueryEngine
+from llama_index.core.base_retriever import BaseRetriever
+from llama_index.schema import NodeWithScore, TextNode
+from llama_index.utils import print_text
+
+_IMPORT_ERROR_MSG = (
+    "`llama_cpp` package not found, please run `pip install llama_cpp_python`"
+)
+
+_RELEVANCE_TOKENS = ["[Irrelevant]", "[Relevant]"]
+
+_RETRIEVAL_TOKENS = ["[No Retrieval]", "[Retrieval]", "[Continue to Use Evidence]"]
+_UTILITY_TOKENS = [
+    "[Utility:1]",
+    "[Utility:2]",
+    "[Utility:3]",
+    "[Utility:4]",
+    "[Utility:5]",
+]
+_GROUND_TOKENS = [
+    "[Fully supported]",
+    "[Partially supported]",
+    "[No support / Contradictory]",
+]
+_CTRL_TOKENS = [
+    "[Fully supported]",
+    "[Partially supported]",
+    "[No support / Contradictory]",
+    "[No Retrieval]",
+    "[Retrieval]",
+    "[Irrelevant]",
+    "[Relevant]",
+    "[Continue to Use Evidence]",
+    "<paragraph>",
+    "</paragraph>",
+    "[Utility:1]",
+    "[Utility:2]",
+    "[Utility:3]",
+    "[Utility:4]",
+    "[Utility:5]",
+]
+
+# logits_all is required so per-token log probabilities (and thus the
+# reflection-token scores below) can be read back from llama-cpp.
+_MODEL_KWARGS = {"logits_all": True, "n_ctx": 2048, "n_gpu_layers": -1}
+_GENERATE_KWARGS = {
+    "temperature": 0.0,
+    "top_p": 1.0,
+    "max_tokens": 50,
+    "logprobs": 32016,  # the vocabulary size, so log probs cover every token
+}
+
+
+@dataclass
+class CriticOutput:
+    llm_response_per_paragraph: Dict[int, str]
+    paragraphs_final_score: Dict[int, float]
+    source_nodes: List[NodeWithScore]
+
+
+def _format_prompt(input: str, paragraph: str = None) -> str:
+    prompt = "### Instruction:\n{0}\n\n### Response:\n".format(input)
+    if paragraph is not None:
+        prompt += "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)
+    return prompt
+
+
+def _postprocess_answer(answer: str) -> str:
+    """Strip control tokens and special markers from the generated answer."""
+    for token in _CTRL_TOKENS:
+        answer = answer.replace(token, "")
+
+    if "</s>" in answer:
+        answer = answer.replace("</s>", "")
+    if "\n" in answer:
+        answer = answer.replace("\n", "")
+
+    if "<|endoftext|>" in answer:
+        answer = answer.replace("<|endoftext|>", "")
+
+    return answer
+
+
+def _relevance_score(pred_log_probs: Dict[str, float]) -> float:
+    """Compute relevance score.
+
+    Args:
+        pred_log_probs (Dict[str, float]): log probabilities of the candidate tokens at the first generated position
+
+    Returns:
+        float: relevance score
+    """
+    # P([Relevant]) normalized against P([Relevant]) + P([Irrelevant])
+    rel_prob = np.exp(float(pred_log_probs["[Relevant]"]))
+    irel_prob = np.exp(float(pred_log_probs["[Irrelevant]"]))
+    return rel_prob / (rel_prob + irel_prob)
+
+
+def _is_supported_score(
+    pred_tokens: List[int], pred_log_probs_dict: List[Dict[str, float]]
+) -> float:
+    """Compute support score.
+
+    Args:
+        pred_tokens (List[int]): list of predicted tokens
+        pred_log_probs_dict (List[Dict[str, float]]): log probabilities of the candidate tokens at each predicted position
+
+    Returns:
+        float: support score
+    """
+    isSup_score = 0
+    groundness_token_appear_id = -1
+    # Find the first ground (support) token in the prediction.
+    for tok_idx, token in enumerate(pred_tokens):
+        if token in _GROUND_TOKENS:
+            groundness_token_appear_id = tok_idx
+            break
+    if groundness_token_appear_id > -1:
+        grd_score_dict = {}
+        for token in _GROUND_TOKENS:
+            prob = pred_log_probs_dict[groundness_token_appear_id][token]
+            grd_score_dict[token] = np.exp(float(prob))
+        # Full support counts fully, partial support counts half,
+        # normalized over all ground tokens.
+        isSup_score = (
+            grd_score_dict["[Fully supported]"]
+            + 0.5 * grd_score_dict["[Partially supported]"]
+        ) / np.sum(list(grd_score_dict.values()))
+    return isSup_score
+
+
+def _is_useful_score(
+    pred_tokens: List[int], pred_log_probs_dict: List[Dict[str, float]]
+) -> float:
+    """Compute usefulness score.
+
+    Args:
+        pred_tokens (List[int]): list of predicted tokens
+        pred_log_probs_dict (List[Dict[str, float]]): log probabilities of the candidate tokens at each predicted position
+
+    Returns:
+        float: usefulness score
+    """
+    isUse_score = 0
+    utility_token_appear_id = -1
+    # Find the last utility token in the prediction.
+    for tok_idx, tok in enumerate(pred_tokens):
+        if tok in _UTILITY_TOKENS:
+            utility_token_appear_id = tok_idx
+    if utility_token_appear_id > -1:
+        ut_score_dict = {}
+        for token in _UTILITY_TOKENS:
+            prob = pred_log_probs_dict[utility_token_appear_id][token]
+            ut_score_dict[token] = np.exp(float(prob))
+
+        # Weighted average of the five utility levels, mapped onto [-1, 1].
+        ut_sum = np.sum(list(ut_score_dict.values()))
+        ut_weights = [-1, -0.5, 0, 0.5, 1]
+        isUse_score = np.sum(
+            [
+                ut_weights[i] * (ut_score_dict["[Utility:{}]".format(i + 1)] / ut_sum)
+                for i in range(len(ut_weights))
+            ]
+        )
+    return isUse_score
+
+
+class SelfRAGQueryEngine(CustomQueryEngine):
+    """Simple short-form self-RAG query engine."""
+
+    llm: Any = Field(default=None, description="llm")
+    retriever: BaseRetriever = Field(default=None, description="retriever")
+    generate_kwargs: Dict = Field(default=None, description="llm generation arguments")
+    verbose: bool = Field(default=True, description="Verbose.")
+
+    def __init__(
+        self,
+        model_path: str,
+        retriever: BaseRetriever,
+        verbose: bool = False,
+        model_kwargs: Dict = None,
+        generate_kwargs: Dict = None,
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        super().__init__(verbose=verbose, **kwargs)
+        model_kwargs = model_kwargs or _MODEL_KWARGS
+        self.generate_kwargs = generate_kwargs or _GENERATE_KWARGS
+        try:
+            from llama_cpp import Llama  # noqa: F401
+        except ImportError:
+            raise ImportError(_IMPORT_ERROR_MSG)
+        self.llm = Llama(model_path=model_path, verbose=verbose, **model_kwargs)
+        self.retriever = retriever
+
+    def _run_critic(self, paragraphs: List[str]) -> CriticOutput:
+        """Run the critic component: the LLM generates a response for each paragraph and then evaluates it.
+
+        Args:
+            paragraphs (List[str]): list of paragraphs to evaluate
+
+        Returns:
+            CriticOutput: final score per paragraph, LLM predictions, and source nodes
+        """
+        paragraphs_final_score = {}
+        llm_response_text = {}
+        source_nodes = []
+
+        for p_idx, paragraph in enumerate(paragraphs):
+            pred = self.llm(paragraph, **self.generate_kwargs)
+            # Cache llm answer
+            llm_response_text[p_idx] = pred["choices"][0]["text"]
+
+            logprobs = pred["choices"][0]["logprobs"]
+            pred_log_probs = logprobs["top_logprobs"]
+            # Compute isRel score, on the first predicted token
+            isRel_score = _relevance_score(pred_log_probs[0])
+
+            # Compute isSup score
+            isSup_score = _is_supported_score(logprobs["tokens"], pred_log_probs)
+
+            # Compute isUse score
+            isUse_score = _is_useful_score(logprobs["tokens"], pred_log_probs)
+
+            paragraphs_final_score[p_idx] = (
+                isRel_score + isSup_score + 0.5 * isUse_score
+            )
+            # Add the paragraph as source node with its relevance score
+            source_nodes.append(
+                NodeWithScore(
+                    node=TextNode(text=paragraph, id_=p_idx),
+                    score=isRel_score,
+                )
+            )
+
+            if self.verbose:
+                print_text(
+                    f"Input: {paragraph}\nPrediction: {llm_response_text[p_idx]}\nScore: {paragraphs_final_score[p_idx]}\n",
+                    color="blue",
+                )
+                print_text(
+                    f"{p_idx + 1}/{len(paragraphs)} paragraphs done\n\n", color="blue"
+                )
+
+        return CriticOutput(llm_response_text, paragraphs_final_score, source_nodes)
+
+    def custom_query(self, query_str: str) -> Response:
+        """Run self-RAG."""
+        response = self.llm(prompt=_format_prompt(query_str), **self.generate_kwargs)
+        answer = response["choices"][0]["text"]
+        source_nodes = []
+
+        if "[Retrieval]" in answer:
+            if self.verbose:
+                print_text("Retrieval required\n", color="blue")
+            documents = self.retriever.retrieve(query_str)
+            if self.verbose:
+                print_text(f"Received: {len(documents)} documents\n", color="blue")
+            paragraphs = [
+                _format_prompt(query_str, document.node.text) for document in documents
+            ]
+
+            if self.verbose:
+                print_text("Start evaluation\n", color="blue")
+
+            critic_output = self._run_critic(paragraphs)
+
+            paragraphs_final_score = critic_output.paragraphs_final_score
+            llm_response_per_paragraph = critic_output.llm_response_per_paragraph
+            source_nodes = critic_output.source_nodes
+
+            if self.verbose:
+                print_text("End evaluation\n", color="blue")
+
+            best_paragraph_id = max(
+                paragraphs_final_score, key=paragraphs_final_score.get
+            )
+            answer = llm_response_per_paragraph[best_paragraph_id]
+            if self.verbose:
+                print_text(f"Selected the best answer: {answer}\n", color="blue")
+
+        answer = _postprocess_answer(answer)
+        if self.verbose:
+            print_text(f"Final answer: {answer}\n", color="green")
+        return Response(response=str(answer), source_nodes=source_nodes)
+
+
+class SelfRAGPack(BaseLlamaPack):
+    """Simple short-form self-RAG pack."""
+
+    def __init__(
+        self,
+        model_path: str,
+        retriever: BaseRetriever,
+        verbose: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        self.query_engine = SelfRAGQueryEngine(model_path, retriever, verbose)
+
+    def get_modules(self) -> Dict[str, Any]:
+        """Get modules."""
+        return {
+            "query_engine": self.query_engine,
+            "llm": self.query_engine.llm,
+            "retriever": self.query_engine.retriever,
+        }
+
+    def run(self, *args: Any, **kwargs: Any) -> Any:
+        """Run the pipeline."""
+        return self.query_engine.query(*args, **kwargs)
diff --git a/llama_hub/llama_packs/self_rag/requirements.txt b/llama_hub/llama_packs/self_rag/requirements.txt
new file mode 100644
index 0000000000..01fbed3626
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/requirements.txt
@@ -0,0 +1 @@
+llama_cpp_python
\ No newline at end of file
diff --git a/llama_hub/llama_packs/self_rag/self_rag.ipynb b/llama_hub/llama_packs/self_rag/self_rag.ipynb
new file mode 100644
index 0000000000..639f624dc5
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/self_rag.ipynb
@@ -0,0 +1,646 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "94508a4f-ec1a-4a85-b104-63b6dd18a791",
+   "metadata": {},
+   "source": [
+    "# Simple Self RAG Notebook\n",
+    "\n",
+    "<a href=\"https://colab.research.google.com/github/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
+    "\n",
+    "This LlamaPack implements, in short form, the [self-RAG paper by Asai et al.](https://arxiv.org/pdf/2310.11511.pdf).\n",
+    "\n",
+    "The paper presents a novel framework called Self-Reflective Retrieval-Augmented Generation (SELF-RAG), which aims to enhance the quality and factuality of large language models (LLMs) by combining retrieval and self-reflection mechanisms.\n",
+    "\n",
+    "The implementation is adapted from the authors' [implementation](https://github.com/AkariAsai/self-rag).\n",
+    "A full notebook guide can be found [here](https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0646bc55-493e-4836-9df3-0f42814bc4fd",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e52689a4-152b-49ef-8bf1-f588a692c5ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import Document, VectorStoreIndex\n",
+    "from llama_index.retrievers import VectorIndexRetriever\n",
+    "\n",
+    "# Create documents\n",
+    "documents = [\n",
+    "    Document(\n",
+    "        text=\"A group of penguins, known as a 'waddle' on land, shuffled across the Antarctic ice, their tuxedo-like plumage standing out against the snow.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Emperor penguins, the tallest of all penguin species, can dive deeper than any other bird, reaching depths of over 500 meters.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins' black and white coloring is a form of camouflage called countershading; from above, their black back blends with the ocean depths, and from below, their white belly matches the bright surface.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Despite their upright stance, penguins are birds that cannot fly; their wings have evolved into flippers, making them expert swimmers.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"The fastest species, the Gentoo penguin, can swim up to 36 kilometers per hour, using their flippers and streamlined bodies to slice through the water.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins are social birds; many species form large colonies for breeding, which can number in the tens of thousands.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Intriguingly, penguins have excellent hearing and rely on distinct calls to identify their mates and chicks amidst the noisy colonies.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"The smallest penguin species, the Little Blue Penguin, stands just about 40 cm tall and is found along the coastlines of southern Australia and New Zealand.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"During the breeding season, male Emperor penguins endure the harsh Antarctic winter for months, fasting and incubating their eggs, while females hunt at sea.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins consume a variety of seafood; their diet mainly consists of fish, squid, and krill, which they catch on their diving expeditions.\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "index = VectorStoreIndex.from_documents(documents)\n",
+    "\n",
+    "# Setup a simple retriever\n",
+    "retriever = VectorIndexRetriever(\n",
+    "    index=index,\n",
+    "    similarity_top_k=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9206f7-8271-4436-8b77-c653e8f1620f",
+   "metadata": {},
+   "source": [
+    "## Load Pack / Setup\n",
+    "\n",
+    "Now we do `download_llama_pack` to load the Self-RAG LlamaPack (you can also import the module directly if using the llama-hub package).\n",
+    "\n",
+    "We will also optionally set up observability/tracing so we can observe the intermediate steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "74dfb6f1-68c9-4378-83c0-318e12b1cb41",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Option: if developing with the llama_hub package\n",
+    "# from base import SelfRAGQueryEngine\n",
+    "\n",
+    "\n",
+    "# Option: download llama_pack\n",
+    "from llama_index.llama_pack import download_llama_pack\n",
+    "\n",
+    "download_llama_pack(\n",
+    "    \"SelfRAGPack\",\n",
+    "    \"./self_rag_pack\",\n",
+    "    skip_load=True,\n",
+    ")\n",
+    "from self_rag_pack.base import SelfRAGQueryEngine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7065a08f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: huggingface-hub in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (0.20.3)\n",
+      "Requirement already satisfied: tqdm>=4.42.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (4.66.1)\n",
+      "Requirement already satisfied: filelock in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (3.13.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (4.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (2023.12.2)\n",
+      "Requirement already satisfied: requests in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (2.31.0)\n",
+      "Requirement already satisfied: packaging>=20.9 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (23.2)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (6.0.1)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (2023.11.17)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (2.0.7)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (3.6)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (3.3.2)\n",
+      "Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n",
+      "downloading https://huggingface.co/m4r1/selfrag_llama2_7b-GGUF/resolve/main/selfrag_llama2_7b.q4_k_m.gguf to /home/mmaatouk/.cache/huggingface/hub/tmpdqmfpera\n",
+      "selfrag_llama2_7b.q4_k_m.gguf: 100%|███████| 4.08G/4.08G [02:37<00:00, 25.9MB/s]\n",
+      "/home/mmaatouk/tmp/selfrag_llama2_7b.q4_k_m.gguf\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Download the self-RAG model\n",
+    "download_dir = \"/home/mmaatouk/tmp\"  # Replace\n",
+    "!pip3 install -q huggingface-hub\n",
+    "!huggingface-cli download m4r1/selfrag_llama2_7b-GGUF selfrag_llama2_7b.q4_k_m.gguf --local-dir {download_dir} --local-dir-use-symlinks False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "e48f36b3-9278-4da9-891c-3af4de747528",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /home/mmaatouk/tmp/selfrag_llama2_7b.q4_k_m.gguf (version GGUF V3 (latest))\n",
+      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
+      "llama_model_loader: - kv 0: general.architecture str = llama\n",
+      "llama_model_loader: - kv 1: general.name str = LLaMA v2\n",
+      "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n",
+      "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
+      "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
+      "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n",
+      "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
+      "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
+      "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n",
+      "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
+      "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
+      "llama_model_loader: - kv 11: general.file_type u32 = 15\n",
+      "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
+      "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32016] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
+      "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32016] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
+      "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32016] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
+      "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
+      "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
+      "llama_model_loader: - kv 18: tokenizer.ggml.add_bos_token bool = true\n",
+      "llama_model_loader: - kv 19: tokenizer.ggml.add_eos_token bool = false\n",
+      "llama_model_loader: - kv 20: general.quantization_version u32 = 2\n",
+      "llama_model_loader: - type f32: 65 tensors\n",
+      "llama_model_loader: - type q4_K: 193 tensors\n",
+      "llama_model_loader: - type q6_K: 33 tensors\n",
+      "llm_load_vocab: special tokens definition check successful ( 275/32016 ).\n",
+      "llm_load_print_meta: format = GGUF V3 (latest)\n",
+      "llm_load_print_meta: arch = llama\n",
+      "llm_load_print_meta: vocab type = SPM\n",
+      "llm_load_print_meta: n_vocab = 32016\n",
+      "llm_load_print_meta: n_merges = 0\n",
+      "llm_load_print_meta: n_ctx_train = 4096\n",
+      "llm_load_print_meta: n_embd = 4096\n",
+      "llm_load_print_meta: n_head = 32\n",
+      "llm_load_print_meta: n_head_kv = 32\n",
+      "llm_load_print_meta: n_layer = 32\n",
+      "llm_load_print_meta: n_rot = 128\n",
+      "llm_load_print_meta: n_embd_head_k = 128\n",
+      "llm_load_print_meta: n_embd_head_v = 128\n",
+      "llm_load_print_meta: n_gqa = 1\n",
+      "llm_load_print_meta: n_embd_k_gqa = 4096\n",
+      "llm_load_print_meta: n_embd_v_gqa = 4096\n",
+      "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
+      "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
+      "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
+      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
+      "llm_load_print_meta: n_ff = 11008\n",
+      "llm_load_print_meta: n_expert = 0\n",
+      "llm_load_print_meta: n_expert_used = 0\n",
+      "llm_load_print_meta: rope scaling = linear\n",
+      "llm_load_print_meta: freq_base_train = 10000.0\n",
+      "llm_load_print_meta: freq_scale_train = 1\n",
+      "llm_load_print_meta: n_yarn_orig_ctx = 4096\n",
+      "llm_load_print_meta: rope_finetuned = unknown\n",
+      "llm_load_print_meta: model type = 7B\n",
+      "llm_load_print_meta: model ftype = Q4_K - Medium\n",
+      "llm_load_print_meta: model params = 6.74 B\n",
+      "llm_load_print_meta: model size = 3.80 GiB (4.84 BPW) \n",
+      "llm_load_print_meta: general.name = LLaMA v2\n",
+      "llm_load_print_meta: BOS token = 1 '<s>'\n",
+      "llm_load_print_meta: EOS token = 2 '</s>'\n",
+      "llm_load_print_meta: UNK token = 0 '<unk>'\n",
+      "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
+      "llm_load_tensors: ggml ctx size = 0.11 MiB\n",
+      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
+      "llm_load_tensors: offloading non-repeating layers to GPU\n",
+      "llm_load_tensors: offloaded 33/33 layers to GPU\n",
+      "llm_load_tensors: CPU buffer size = 3891.33 MiB\n",
+      "..................................................................................................\n",
+      "llama_new_context_with_model: n_ctx = 2048\n",
+      "llama_new_context_with_model: freq_base = 10000.0\n",
+      "llama_new_context_with_model: freq_scale = 1\n",
+      "llama_kv_cache_init: CPU KV buffer size = 1024.00 MiB\n",
+      "llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB\n",
+      "llama_new_context_with_model: graph splits (measure): 1\n",
+      "llama_new_context_with_model: CPU compute buffer size = 172.00 MiB\n",
+      "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
+      "Model metadata: {'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "model_path = Path(download_dir) / \"selfrag_llama2_7b.q4_k_m.gguf\"\n",
+    "query_engine = SelfRAGQueryEngine(str(model_path), retriever, verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40213404-e3a2-4412-a217-82fa95f06cec",
+   "metadata": {},
+   "source": [
+    "## Try out some Queries\n",
+    "\n",
+    "Now let's try out our `SelfRAGQueryEngine`!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "cdb8462b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
"stderr", + "output_type": "stream", + "text": [ + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 6.87 ms / 22 runs ( 0.31 ms per token, 3201.40 tokens per second)\n", + "llama_print_timings: prompt eval time = 1582.02 ms / 24 tokens ( 65.92 ms per token, 15.17 tokens per second)\n", + "llama_print_timings: eval time = 2685.22 ms / 21 runs ( 127.87 ms per token, 7.82 tokens per second)\n", + "llama_print_timings: total time = 4364.67 ms / 45 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;32mFinal answer: The book \"Pride and Prejudice\" is a romantic novel by Jane Austen.\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# No retreival example\n", + "response = query_engine.query(\"Which genre the book pride and prejudice?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "127b16c5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 16.08 ms / 50 runs ( 0.32 ms per token, 3108.68 tokens per second)\n", + "llama_print_timings: prompt eval time = 1005.45 ms / 16 tokens ( 62.84 ms per token, 15.91 tokens per second)\n", + "llama_print_timings: eval time = 6345.52 ms / 49 runs ( 129.50 ms per token, 7.72 tokens per second)\n", + "llama_print_timings: total time = 7517.03 ms / 65 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mRetreival required\n", + "\u001b[0m\u001b[1;3;34mReceived: 10 documents\n", + "\u001b[0m\u001b[1;3;34mStart evaluation\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 13.51 ms / 43 runs ( 0.31 ms per token, 3183.53 tokens per second)\n", + "llama_print_timings: prompt eval time = 2447.83 ms / 39 tokens ( 62.76 ms per token, 15.93 tokens per second)\n", + "llama_print_timings: eval time = 5438.94 ms / 42 runs ( 129.50 ms per token, 7.72 tokens per second)\n", + "llama_print_timings: total time = 8188.26 ms / 81 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]The smallest penguin species, the Little Blue Penguin, stands just about 40 cm tall and is found along the coastlines of southern Australia and New Zealand.\n", + "Prediction: [Relevant]The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.[Fully supported][Utility:5]\n", + "Score: 2.4709723458196087\n", + "\u001b[0m\u001b[1;3;34m1/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 8.51 ms / 26 runs ( 0.33 ms per token, 3054.15 tokens per second)\n", + "llama_print_timings: prompt eval time = 2431.51 ms / 37 tokens ( 65.72 ms per token, 15.22 tokens per second)\n", + "llama_print_timings: eval time = 3271.24 ms / 25 runs ( 130.85 ms per token, 7.64 tokens per second)\n", + "llama_print_timings: total time = 5901.59 ms / 62 
tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]Emperor penguins, the tallest of all penguin species, can dive deeper than any other bird, reaching depths of over 500 meters.\n", + "Prediction: [Relevant]The smallest penguin species is the Emperor Penguin (Aptenodytes forsteri).[Fully supported][Utility:5]\n", + "Score: 2.1767850110288887\n", + "\u001b[0m\u001b[1;3;34m2/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 8.62 ms / 26 runs ( 0.33 ms per token, 3016.59 tokens per second)\n", + "llama_print_timings: prompt eval time = 2846.05 ms / 43 tokens ( 66.19 ms per token, 15.11 tokens per second)\n", + "llama_print_timings: eval time = 3340.62 ms / 25 runs ( 133.62 ms per token, 7.48 tokens per second)\n", + "llama_print_timings: total time = 6433.70 ms / 68 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]A group of penguins, known as a 'waddle' on land, shuffled across the Antarctic ice, their tuxedo-like plumage standing out against the snow.\n", + "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n", + "Score: 1.5998614571701189\n", + "\u001b[0m\u001b[1;3;34m3/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 6.24 ms / 18 runs ( 0.35 ms per token, 2885.54 tokens per second)\n", + "llama_print_timings: prompt eval time = 2461.25 ms / 37 tokens ( 66.52 ms per token, 15.03 tokens per second)\n", + "llama_print_timings: eval time = 2272.68 ms / 17 runs ( 133.69 ms per token, 7.48 tokens per second)\n", + "llama_print_timings: total time = 4892.65 ms / 54 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]Despite their upright stance, penguins are birds that cannot fly; their wings have evolved into flippers, making them expert swimmers.\n", + "Prediction: [Relevant]The height of a penguin varies depending on the species.[No support / Contradictory][Utility:5]\n", + "Score: 1.4486356991581153\n", + "\u001b[0m\u001b[1;3;34m4/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 13.34 ms / 39 runs ( 0.34 ms per token, 2923.10 tokens per second)\n", + "llama_print_timings: prompt eval time = 2735.91 ms / 41 tokens ( 66.73 ms per token, 14.99 tokens per second)\n", + "llama_print_timings: eval time = 5088.15 ms / 38 runs ( 133.90 ms per token, 7.47 tokens per second)\n", + "llama_print_timings: total time = 8140.45 ms / 79 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: 
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>The fastest species, the Gentoo penguin, can swim up to 36 kilometers per hour, using their flippers and streamlined bodies to slice through the water.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (also known as the jackass penguin).[No support / Contradictory][Continue to Use Evidence]They are about 17 inches tall.[Utility:5]\n",
+      "Score: 1.4687150930489146\n",
+      "\u001b[0m\u001b[1;3;34m5/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 13.66 ms / 38 runs ( 0.36 ms per token, 2781.64 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 3413.28 ms / 50 tokens ( 68.27 ms per token, 14.65 tokens per second)\n",
+      "llama_print_timings: eval time = 4859.28 ms / 37 runs ( 131.33 ms per token, 7.61 tokens per second)\n",
+      "llama_print_timings: total time = 8526.62 ms / 87 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins' black and white coloring is a form of camouflage called countershading; from above, their black back blends with the ocean depths, and from below, their white belly matches the bright surface.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor), which can grow to be about 17 inches tall.[No support / Contradictory][Utility:5]\n",
+      "Score: 1.6357659323827645\n",
+      "\u001b[0m\u001b[1;3;34m6/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 8.49 ms / 25 runs ( 0.34 ms per token, 2943.25 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 1744.16 ms / 24 tokens ( 72.67 ms per token, 13.76 tokens per second)\n",
+      "llama_print_timings: eval time = 3862.56 ms / 24 runs ( 160.94 ms per token, 6.21 tokens per second)\n",
+      "llama_print_timings: total time = 5867.77 ms / 48 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins are social birds; many species form large colonies for breeding, which can number in the tens of thousands.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4935304338695037\n",
+      "\u001b[0m\u001b[1;3;34m7/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 9.34 ms / 26 runs ( 0.36 ms per token, 2782.83 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2438.93 ms / 33 tokens ( 73.91 ms per token, 13.53 tokens per second)\n",
+      "llama_print_timings: eval time = 3521.54 ms / 25 runs ( 140.86 ms per token, 7.10 tokens per second)\n",
+      "llama_print_timings: total time = 6158.74 ms / 58 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Intriguingly, penguins have excellent hearing and rely on distinct calls to identify their mates and chicks amidst the noisy colonies.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4074488783945505\n",
+      "\u001b[0m\u001b[1;3;34m8/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 9.03 ms / 26 runs ( 0.35 ms per token, 2878.02 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2850.73 ms / 41 tokens ( 69.53 ms per token, 14.38 tokens per second)\n",
+      "llama_print_timings: eval time = 3430.31 ms / 25 runs ( 137.21 ms per token, 7.29 tokens per second)\n",
+      "llama_print_timings: total time = 6558.69 ms / 66 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>During the breeding season, male Emperor penguins endure the harsh Antarctic winter for months, fasting and incubating their eggs, while females hunt at sea.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the Emperor Penguin (Aptenodytes forsteri).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.415058228804781\n",
+      "\u001b[0m\u001b[1;3;34m9/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 7.27 ms / 20 runs ( 0.36 ms per token, 2752.55 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2766.95 ms / 37 tokens ( 74.78 ms per token, 13.37 tokens per second)\n",
+      "llama_print_timings: eval time = 2538.61 ms / 19 runs ( 133.61 ms per token, 7.48 tokens per second)\n",
+      "llama_print_timings: total time = 5471.43 ms / 56 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins consume a variety of seafood; their diet mainly consists of fish, squid, and krill, which they catch on their diving expeditions.</paragraph>\n",
+      "Prediction: [Relevant]The height of the smallest penguin species can vary depending on the species.[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4213598342974365\n",
+      "\u001b[0m\u001b[1;3;34m10/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m\u001b[1;3;34mEnd evaluation\n",
+      "\u001b[0m\u001b[1;3;34mSelected the best answer: [Relevant]The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.[Fully supported][Utility:5]\n",
+      "\u001b[0m\u001b[1;3;32mFinal answer: The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieval example\n",
+    "response = query_engine.query(\"How tall is the smallest penguins?\")"
penguins?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}