diff --git a/llama_hub/llama_packs/library.json b/llama_hub/llama_packs/library.json
index b571ed3f67..cd8ba8bcaa 100644
--- a/llama_hub/llama_packs/library.json
+++ b/llama_hub/llama_packs/library.json
@@ -278,6 +278,11 @@
     "author": "jerryjliu",
     "keywords": ["vanna", "sql", "ai", "text-to-sql"]
   },
+  "SelfRAGPack": {
+    "id": "llama_packs/self_rag",
+    "author": "mmaatouk",
+    "keywords": ["self-RAG", "llm", "smart-retriever"]
+  },
   "InferRetrieveRerankPack": {
     "id": "llama_packs/research/infer_retrieve_rerank",
     "author": "jerryjliu",
diff --git a/llama_hub/llama_packs/self_rag/README.md b/llama_hub/llama_packs/self_rag/README.md
new file mode 100644
index 0000000000..c5fe9cf995
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/README.md
@@ -0,0 +1,63 @@
+# Simple self-RAG short-form pack
+
+This LlamaPack implements, in short form, the [self-RAG paper by Asai et al.](https://arxiv.org/pdf/2310.11511.pdf).
+
+The paper presents a novel framework called Self-Reflective Retrieval-Augmented Generation (SELF-RAG), which aims to enhance the quality and factuality of large language models (LLMs) by combining retrieval and self-reflection mechanisms.
+
+The implementation is adapted from the authors' [implementation](https://github.com/AkariAsai/self-rag).
+A full notebook guide can be found [here](https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb).
+
+## CLI Usage
+
+You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:
+
+```bash
+llamaindex-cli download-llamapack SelfRAGPack --download-dir ./self_rag_pack
+```
+
+You can then inspect the files at `./self_rag_pack` and use them as a template for your own project!
+
+## Code Usage
+
+We will show you how to import the query engine from these files!
+The implementation uses llama-cpp; to download the relevant model (be sure to replace `DIR_PATH` with your target directory):
+
+```bash
+pip3 install -q huggingface-hub
+huggingface-cli download m4r1/selfrag_llama2_7b-GGUF selfrag_llama2_7b.q4_k_m.gguf --local-dir "DIR_PATH" --local-dir-use-symlinks False
+```
+
+```python
+from llama_index.llama_pack import download_llama_pack
+
+# download and install dependencies
+SelfRAGPack = download_llama_pack(
+    "SelfRAGPack", "./self_rag_pack"
+)
+```
+
+From here, you can use the pack. You can import the relevant modules from the download folder (in the example below we assume it's a relative import, or that the directory has been added to your system path).
+
+```python
+from self_rag_pack.base import SelfRAGQueryEngine
+
+query_engine = SelfRAGQueryEngine(model_path=model_path, retriever=retriever, verbose=True)
+
+response = query_engine.query("Who won best Director in the 1972 Academy Awards?")
+```
+
+You can also use/initialize the pack directly.
+
+```python
+from self_rag_pack.base import SelfRAGPack
+
+agent_pack = SelfRAGPack(model_path=model_path, retriever=retriever, verbose=True)
+```
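+
+Both snippets above assume you already have a `model_path` pointing at the downloaded GGUF file and a `retriever` to hand the pack. As a minimal sketch (any `BaseRetriever` works; this mirrors the simple in-memory setup from the notebook guide, and `"DIR_PATH"` is a placeholder for the download directory you chose above):
+
+```python
+from pathlib import Path
+
+from llama_index import Document, VectorStoreIndex
+from llama_index.retrievers import VectorIndexRetriever
+
+# Path to the GGUF file downloaded with huggingface-cli above ("DIR_PATH" is a placeholder).
+model_path = str(Path("DIR_PATH") / "selfrag_llama2_7b.q4_k_m.gguf")
+
+# A simple vector retriever over a few in-memory documents.
+documents = [Document(text="Some text the engine can retrieve against.")]
+index = VectorStoreIndex.from_documents(documents)
+retriever = VectorIndexRetriever(index=index, similarity_top_k=10)
+```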
+
+The `run()` function is a light wrapper around `query_engine.query()`.
+
+```python
+response = agent_pack.run("Who won best Director in the 1972 Academy Awards?")
+```
diff --git a/llama_hub/llama_packs/self_rag/__init__.py b/llama_hub/llama_packs/self_rag/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/llama_hub/llama_packs/self_rag/base.py b/llama_hub/llama_packs/self_rag/base.py
new file mode 100644
index 0000000000..afa160b0d2
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/base.py
@@ -0,0 +1,310 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List
+import numpy as np
+
+from llama_index import Response
+from llama_index.llama_pack.base import BaseLlamaPack
+from llama_index.bridge.pydantic import Field
+from llama_index.query_engine import CustomQueryEngine
+from llama_index.core.base_retriever import BaseRetriever
+from llama_index.schema import NodeWithScore, TextNode
+from llama_index.utils import print_text
+
+_IMPORT_ERROR_MSG = (
+    "`llama_cpp` package not found, please run `pip install llama_cpp_python`"
+)
+
+_RELEVANCE_TOKENS = ["[Irrelevant]", "[Relevant]"]
+
+_RETRIEVAL_TOKENS = ["[No Retrieval]", "[Retrieval]", "[Continue to Use Evidence]"]
+_UTILITY_TOKENS = [
+    "[Utility:1]",
+    "[Utility:2]",
+    "[Utility:3]",
+    "[Utility:4]",
+    "[Utility:5]",
+]
+_GROUND_TOKENS = [
+    "[Fully supported]",
+    "[Partially supported]",
+    "[No support / Contradictory]",
+]
+_CTRL_TOKENS = [
+    "[Fully supported]",
+    "[Partially supported]",
+    "[No support / Contradictory]",
+    "[No Retrieval]",
+    "[Retrieval]",
+    "[Irrelevant]",
+    "[Relevant]",
+    "[Continue to Use Evidence]",
+    "<paragraph>",
+    "</paragraph>",
+    "[Utility:1]",
+    "[Utility:2]",
+    "[Utility:3]",
+    "[Utility:4]",
+    "[Utility:5]",
+]
+
+# logits_all is required so per-token log probabilities (and thus the
+# reflection-token scores below) can be read back from llama-cpp.
+_MODEL_KWARGS = {"logits_all": True, "n_ctx": 2048, "n_gpu_layers": -1}
+_GENERATE_KWARGS = {
+    "temperature": 0.0,
+    "top_p": 1.0,
+    "max_tokens": 50,
+    "logprobs": 32016,  # the vocabulary size, so log probs cover every token
+}
+
+
+@dataclass
+class CriticOutput:
+    llm_response_per_paragraph: Dict[int, str]
+    paragraphs_final_score: Dict[int, float]
+    source_nodes: List[NodeWithScore]
+
+
+def _format_prompt(input: str, paragraph: str = None) -> str:
+    prompt = "### Instruction:\n{0}\n\n### Response:\n".format(input)
+    if paragraph is not None:
+        prompt += "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)
+    return prompt
+
+
+def _postprocess_answer(answer: str) -> str:
+    """Strip control tokens and special markers from the generated answer."""
+    for token in _CTRL_TOKENS:
+        answer = answer.replace(token, "")
+
+    if "</s>" in answer:
+        answer = answer.replace("</s>", "")
+    if "\n" in answer:
+        answer = answer.replace("\n", "")
+
+    if "<|endoftext|>" in answer:
+        answer = answer.replace("<|endoftext|>", "")
+
+    return answer
+
+
+def _relevance_score(pred_log_probs: Dict[str, float]) -> float:
+    """Compute relevance score.
+
+    Args:
+        pred_log_probs (Dict[str, float]): log probabilities of the candidate tokens at the first generated position
+
+    Returns:
+        float: relevance score
+    """
+    # P([Relevant]) normalized against P([Relevant]) + P([Irrelevant])
+    rel_prob = np.exp(float(pred_log_probs["[Relevant]"]))
+    irel_prob = np.exp(float(pred_log_probs["[Irrelevant]"]))
+    return rel_prob / (rel_prob + irel_prob)
+
+
+def _is_supported_score(
+    pred_tokens: List[int], pred_log_probs_dict: List[Dict[str, float]]
+) -> float:
+    """Compute support score.
+
+    Args:
+        pred_tokens (List[int]): list of predicted tokens
+        pred_log_probs_dict (List[Dict[str, float]]): log probabilities of the candidate tokens at each predicted position
+
+    Returns:
+        float: support score
+    """
+    isSup_score = 0
+    groundness_token_appear_id = -1
+    # Find the first ground (support) token in the prediction.
+    for tok_idx, token in enumerate(pred_tokens):
+        if token in _GROUND_TOKENS:
+            groundness_token_appear_id = tok_idx
+            break
+    if groundness_token_appear_id > -1:
+        grd_score_dict = {}
+        for token in _GROUND_TOKENS:
+            prob = pred_log_probs_dict[groundness_token_appear_id][token]
+            grd_score_dict[token] = np.exp(float(prob))
+        # Full support counts fully, partial support counts half,
+        # normalized over all ground tokens.
+        isSup_score = (
+            grd_score_dict["[Fully supported]"]
+            + 0.5 * grd_score_dict["[Partially supported]"]
+        ) / np.sum(list(grd_score_dict.values()))
+    return isSup_score
+
+
+def _is_useful_score(
+    pred_tokens: List[int], pred_log_probs_dict: List[Dict[str, float]]
+) -> float:
+    """Compute usefulness score.
+
+    Args:
+        pred_tokens (List[int]): list of predicted tokens
+        pred_log_probs_dict (List[Dict[str, float]]): log probabilities of the candidate tokens at each predicted position
+
+    Returns:
+        float: usefulness score
+    """
+    isUse_score = 0
+    utility_token_appear_id = -1
+    # Find the last utility token in the prediction.
+    for tok_idx, tok in enumerate(pred_tokens):
+        if tok in _UTILITY_TOKENS:
+            utility_token_appear_id = tok_idx
+    if utility_token_appear_id > -1:
+        ut_score_dict = {}
+        for token in _UTILITY_TOKENS:
+            prob = pred_log_probs_dict[utility_token_appear_id][token]
+            ut_score_dict[token] = np.exp(float(prob))
+
+        # Weighted average of the five utility levels, mapped onto [-1, 1].
+        ut_sum = np.sum(list(ut_score_dict.values()))
+        ut_weights = [-1, -0.5, 0, 0.5, 1]
+        isUse_score = np.sum(
+            [
+                ut_weights[i] * (ut_score_dict["[Utility:{}]".format(i + 1)] / ut_sum)
+                for i in range(len(ut_weights))
+            ]
+        )
+    return isUse_score
+
+
+class SelfRAGQueryEngine(CustomQueryEngine):
+    """Simple short-form self-RAG query engine."""
+
+    llm: Any = Field(default=None, description="llm")
+    retriever: BaseRetriever = Field(default=None, description="retriever")
+    generate_kwargs: Dict = Field(default=None, description="llm generation arguments")
+    verbose: bool = Field(default=True, description="Verbose.")
+
+    def __init__(
+        self,
+        model_path: str,
+        retriever: BaseRetriever,
+        verbose: bool = False,
+        model_kwargs: Dict = None,
+        generate_kwargs: Dict = None,
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        super().__init__(verbose=verbose, **kwargs)
+        model_kwargs = model_kwargs or _MODEL_KWARGS
+        self.generate_kwargs = generate_kwargs or _GENERATE_KWARGS
+        try:
+            from llama_cpp import Llama  # noqa: F401
+        except ImportError:
+            raise ImportError(_IMPORT_ERROR_MSG)
+        self.llm = Llama(model_path=model_path, verbose=verbose, **model_kwargs)
+        self.retriever = retriever
+
+    def _run_critic(self, paragraphs: List[str]) -> CriticOutput:
+        """Run the critic component: the LLM generates a response for each paragraph and then evaluates it.
+
+        Args:
+            paragraphs (List[str]): list of paragraphs to evaluate
+
+        Returns:
+            CriticOutput: final score per paragraph, LLM predictions, and source nodes
+        """
+        paragraphs_final_score = {}
+        llm_response_text = {}
+        source_nodes = []
+
+        for p_idx, paragraph in enumerate(paragraphs):
+            pred = self.llm(paragraph, **self.generate_kwargs)
+            # Cache llm answer
+            llm_response_text[p_idx] = pred["choices"][0]["text"]
+
+            logprobs = pred["choices"][0]["logprobs"]
+            pred_log_probs = logprobs["top_logprobs"]
+            # Compute isRel score, on the first predicted token
+            isRel_score = _relevance_score(pred_log_probs[0])
+
+            # Compute isSup score
+            isSup_score = _is_supported_score(logprobs["tokens"], pred_log_probs)
+
+            # Compute isUse score
+            isUse_score = _is_useful_score(logprobs["tokens"], pred_log_probs)
+
+            paragraphs_final_score[p_idx] = (
+                isRel_score + isSup_score + 0.5 * isUse_score
+            )
+            # Add the paragraph as source node with its relevance score
+            source_nodes.append(
+                NodeWithScore(
+                    node=TextNode(text=paragraph, id_=p_idx),
+                    score=isRel_score,
+                )
+            )
+
+            if self.verbose:
+                print_text(
+                    f"Input: {paragraph}\nPrediction: {llm_response_text[p_idx]}\nScore: {paragraphs_final_score[p_idx]}\n",
+                    color="blue",
+                )
+                print_text(
+                    f"{p_idx + 1}/{len(paragraphs)} paragraphs done\n\n", color="blue"
+                )
+
+        return CriticOutput(llm_response_text, paragraphs_final_score, source_nodes)
+
+    def custom_query(self, query_str: str) -> Response:
+        """Run self-RAG."""
+        response = self.llm(prompt=_format_prompt(query_str), **self.generate_kwargs)
+        answer = response["choices"][0]["text"]
+        source_nodes = []
+
+        if "[Retrieval]" in answer:
+            if self.verbose:
+                print_text("Retrieval required\n", color="blue")
+            documents = self.retriever.retrieve(query_str)
+            if self.verbose:
+                print_text(f"Received: {len(documents)} documents\n", color="blue")
+            paragraphs = [
+                _format_prompt(query_str, document.node.text) for document in documents
+            ]
+
+            if self.verbose:
+                print_text("Start evaluation\n", color="blue")
+
+            critic_output = self._run_critic(paragraphs)
+
+            paragraphs_final_score = critic_output.paragraphs_final_score
+            llm_response_per_paragraph = critic_output.llm_response_per_paragraph
+            source_nodes = critic_output.source_nodes
+
+            if self.verbose:
+                print_text("End evaluation\n", color="blue")
+
+            best_paragraph_id = max(
+                paragraphs_final_score, key=paragraphs_final_score.get
+            )
+            answer = llm_response_per_paragraph[best_paragraph_id]
+            if self.verbose:
+                print_text(f"Selected the best answer: {answer}\n", color="blue")
+
+        answer = _postprocess_answer(answer)
+        if self.verbose:
+            print_text(f"Final answer: {answer}\n", color="green")
+        return Response(response=str(answer), source_nodes=source_nodes)
+
+
+class SelfRAGPack(BaseLlamaPack):
+    """Simple short-form self-RAG pack."""
+
+    def __init__(
+        self,
+        model_path: str,
+        retriever: BaseRetriever,
+        verbose: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        self.query_engine = SelfRAGQueryEngine(model_path, retriever, verbose)
+
+    def get_modules(self) -> Dict[str, Any]:
+        """Get modules."""
+        return {
+            "query_engine": self.query_engine,
+            "llm": self.query_engine.llm,
+            "retriever": self.query_engine.retriever,
+        }
+
+    def run(self, *args: Any, **kwargs: Any) -> Any:
+        """Run the pipeline."""
+        return self.query_engine.query(*args, **kwargs)
diff --git a/llama_hub/llama_packs/self_rag/requirements.txt b/llama_hub/llama_packs/self_rag/requirements.txt
new file mode 100644
index 0000000000..01fbed3626
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/requirements.txt
@@ -0,0 +1 @@
+llama_cpp_python
\ No newline at end of file
diff --git a/llama_hub/llama_packs/self_rag/self_rag.ipynb b/llama_hub/llama_packs/self_rag/self_rag.ipynb
new file mode 100644
index 0000000000..639f624dc5
--- /dev/null
+++ b/llama_hub/llama_packs/self_rag/self_rag.ipynb
@@ -0,0 +1,646 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "94508a4f-ec1a-4a85-b104-63b6dd18a791",
+   "metadata": {},
+   "source": [
+    "# Simple Self RAG Notebook\n",
+    "\n",
+    "<a href=\"https://colab.research.google.com/github/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
+    "\n",
+    "This LlamaPack implements, in short form, the [self-RAG paper by Asai et al.](https://arxiv.org/pdf/2310.11511.pdf).\n",
+    "\n",
+    "The paper presents a novel framework called Self-Reflective Retrieval-Augmented Generation (SELF-RAG), which aims to enhance the quality and factuality of large language models (LLMs) by combining retrieval and self-reflection mechanisms.\n",
+    "\n",
+    "The implementation is adapted from the authors' [implementation](https://github.com/AkariAsai/self-rag).\n",
+    "A full notebook guide can be found [here](https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_packs/self_rag/self_rag.ipynb).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0646bc55-493e-4836-9df3-0f42814bc4fd",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e52689a4-152b-49ef-8bf1-f588a692c5ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import Document, VectorStoreIndex\n",
+    "from llama_index.retrievers import VectorIndexRetriever\n",
+    "\n",
+    "# Create documents\n",
+    "documents = [\n",
+    "    Document(\n",
+    "        text=\"A group of penguins, known as a 'waddle' on land, shuffled across the Antarctic ice, their tuxedo-like plumage standing out against the snow.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Emperor penguins, the tallest of all penguin species, can dive deeper than any other bird, reaching depths of over 500 meters.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins' black and white coloring is a form of camouflage called countershading; from above, their black back blends with the ocean depths, and from below, their white belly matches the bright surface.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Despite their upright stance, penguins are birds that cannot fly; their wings have evolved into flippers, making them expert swimmers.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"The fastest species, the Gentoo penguin, can swim up to 36 kilometers per hour, using their flippers and streamlined bodies to slice through the water.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins are social birds; many species form large colonies for breeding, which can number in the tens of thousands.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Intriguingly, penguins have excellent hearing and rely on distinct calls to identify their mates and chicks amidst the noisy colonies.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"The smallest penguin species, the Little Blue Penguin, stands just about 40 cm tall and is found along the coastlines of southern Australia and New Zealand.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"During the breeding season, male Emperor penguins endure the harsh Antarctic winter for months, fasting and incubating their eggs, while females hunt at sea.\"\n",
+    "    ),\n",
+    "    Document(\n",
+    "        text=\"Penguins consume a variety of seafood; their diet mainly consists of fish, squid, and krill, which they catch on their diving expeditions.\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "index = VectorStoreIndex.from_documents(documents)\n",
+    "\n",
+    "# Setup a simple retriever\n",
+    "retriever = VectorIndexRetriever(\n",
+    "    index=index,\n",
+    "    similarity_top_k=10,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9206f7-8271-4436-8b77-c653e8f1620f",
+   "metadata": {},
+   "source": [
+    "## Load Pack / Setup\n",
+    "\n",
+    "Now we do `download_llama_pack` to load the Self-RAG LlamaPack (you can also import the module directly if using the llama-hub package).\n",
+    "\n",
+    "We will also optionally set up observability/tracing so we can observe the intermediate steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "74dfb6f1-68c9-4378-83c0-318e12b1cb41",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Option: if developing with the llama_hub package\n",
+    "# from base import SelfRAGQueryEngine\n",
+    "\n",
+    "\n",
+    "# Option: download llama_pack\n",
+    "from llama_index.llama_pack import download_llama_pack\n",
+    "\n",
+    "download_llama_pack(\n",
+    "    \"SelfRAGPack\",\n",
+    "    \"./self_rag_pack\",\n",
+    "    skip_load=True,\n",
+    ")\n",
+    "from self_rag_pack.base import SelfRAGQueryEngine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7065a08f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: huggingface-hub in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (0.20.3)\n",
+      "Requirement already satisfied: tqdm>=4.42.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (4.66.1)\n",
+      "Requirement already satisfied: filelock in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (3.13.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (4.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (2023.12.2)\n",
+      "Requirement already satisfied: requests in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (2.31.0)\n",
+      "Requirement already satisfied: packaging>=20.9 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (23.2)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from huggingface-hub) (6.0.1)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (2023.11.17)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (2.0.7)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (3.6)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/mmaatouk/repos/mvenv/lib/python3.10/site-packages (from requests->huggingface-hub) (3.3.2)\n",
+      "Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n",
+      "downloading https://huggingface.co/m4r1/selfrag_llama2_7b-GGUF/resolve/main/selfrag_llama2_7b.q4_k_m.gguf to /home/mmaatouk/.cache/huggingface/hub/tmpdqmfpera\n",
+      "selfrag_llama2_7b.q4_k_m.gguf: 100%|███████| 4.08G/4.08G [02:37<00:00, 25.9MB/s]\n",
+      "/home/mmaatouk/tmp/selfrag_llama2_7b.q4_k_m.gguf\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Download the self-RAG model\n",
+    "download_dir = \"/home/mmaatouk/tmp\"  # Replace\n",
+    "!pip3 install -q huggingface-hub\n",
+    "!huggingface-cli download m4r1/selfrag_llama2_7b-GGUF selfrag_llama2_7b.q4_k_m.gguf --local-dir {download_dir} --local-dir-use-symlinks False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "e48f36b3-9278-4da9-891c-3af4de747528",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /home/mmaatouk/tmp/selfrag_llama2_7b.q4_k_m.gguf (version GGUF V3 (latest))\n",
+      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
+      "llama_model_loader: - kv 0: general.architecture str = llama\n",
+      "llama_model_loader: - kv 1: general.name str = LLaMA v2\n",
+      "llama_model_loader: - kv 2: llama.context_length u32 = 4096\n",
+      "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
+      "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
+      "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n",
+      "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
+      "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
+      "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n",
+      "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
+      "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
+      "llama_model_loader: - kv 11: general.file_type u32 = 15\n",
+      "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
+      "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32016] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
+      "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32016] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
+      "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32016] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
+      "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
+      "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n",
+      "llama_model_loader: - kv 18: tokenizer.ggml.add_bos_token bool = true\n",
+      "llama_model_loader: - kv 19: tokenizer.ggml.add_eos_token bool = false\n",
+      "llama_model_loader: - kv 20: general.quantization_version u32 = 2\n",
+      "llama_model_loader: - type f32: 65 tensors\n",
+      "llama_model_loader: - type q4_K: 193 tensors\n",
+      "llama_model_loader: - type q6_K: 33 tensors\n",
+      "llm_load_vocab: special tokens definition check successful ( 275/32016 ).\n",
+      "llm_load_print_meta: format = GGUF V3 (latest)\n",
+      "llm_load_print_meta: arch = llama\n",
+      "llm_load_print_meta: vocab type = SPM\n",
+      "llm_load_print_meta: n_vocab = 32016\n",
+      "llm_load_print_meta: n_merges = 0\n",
+      "llm_load_print_meta: n_ctx_train = 4096\n",
+      "llm_load_print_meta: n_embd = 4096\n",
+      "llm_load_print_meta: n_head = 32\n",
+      "llm_load_print_meta: n_head_kv = 32\n",
+      "llm_load_print_meta: n_layer = 32\n",
+      "llm_load_print_meta: n_rot = 128\n",
+      "llm_load_print_meta: n_embd_head_k = 128\n",
+      "llm_load_print_meta: n_embd_head_v = 128\n",
+      "llm_load_print_meta: n_gqa = 1\n",
+      "llm_load_print_meta: n_embd_k_gqa = 4096\n",
+      "llm_load_print_meta: n_embd_v_gqa = 4096\n",
+      "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
+      "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
+      "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
+      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
+      "llm_load_print_meta: n_ff = 11008\n",
+      "llm_load_print_meta: n_expert = 0\n",
+      "llm_load_print_meta: n_expert_used = 0\n",
+      "llm_load_print_meta: rope scaling = linear\n",
+      "llm_load_print_meta: freq_base_train = 10000.0\n",
+      "llm_load_print_meta: freq_scale_train = 1\n",
+      "llm_load_print_meta: n_yarn_orig_ctx = 4096\n",
+      "llm_load_print_meta: rope_finetuned = unknown\n",
+      "llm_load_print_meta: model type = 7B\n",
+      "llm_load_print_meta: model ftype = Q4_K - Medium\n",
+      "llm_load_print_meta: model params = 6.74 B\n",
+      "llm_load_print_meta: model size = 3.80 GiB (4.84 BPW) \n",
+      "llm_load_print_meta: general.name = LLaMA v2\n",
+      "llm_load_print_meta: BOS token = 1 '<s>'\n",
+      "llm_load_print_meta: EOS token = 2 '</s>'\n",
+      "llm_load_print_meta: UNK token = 0 '<unk>'\n",
+      "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
+      "llm_load_tensors: ggml ctx size = 0.11 MiB\n",
+      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
+      "llm_load_tensors: offloading non-repeating layers to GPU\n",
+      "llm_load_tensors: offloaded 33/33 layers to GPU\n",
+      "llm_load_tensors: CPU buffer size = 3891.33 MiB\n",
+      "..................................................................................................\n",
+      "llama_new_context_with_model: n_ctx = 2048\n",
+      "llama_new_context_with_model: freq_base = 10000.0\n",
+      "llama_new_context_with_model: freq_scale = 1\n",
+      "llama_kv_cache_init: CPU KV buffer size = 1024.00 MiB\n",
+      "llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB\n",
+      "llama_new_context_with_model: graph splits (measure): 1\n",
+      "llama_new_context_with_model: CPU compute buffer size = 172.00 MiB\n",
+      "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n",
+      "Model metadata: {'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "model_path = Path(download_dir) / \"selfrag_llama2_7b.q4_k_m.gguf\"\n",
+    "query_engine = SelfRAGQueryEngine(str(model_path), retriever, verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40213404-e3a2-4412-a217-82fa95f06cec",
+   "metadata": {},
+   "source": [
+    "## Try out some Queries\n",
+    "\n",
+    "Now let's try out our `SelfRAGQueryEngine`!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "cdb8462b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
"stderr", + "output_type": "stream", + "text": [ + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 6.87 ms / 22 runs ( 0.31 ms per token, 3201.40 tokens per second)\n", + "llama_print_timings: prompt eval time = 1582.02 ms / 24 tokens ( 65.92 ms per token, 15.17 tokens per second)\n", + "llama_print_timings: eval time = 2685.22 ms / 21 runs ( 127.87 ms per token, 7.82 tokens per second)\n", + "llama_print_timings: total time = 4364.67 ms / 45 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;32mFinal answer: The book \"Pride and Prejudice\" is a romantic novel by Jane Austen.\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# No retreival example\n", + "response = query_engine.query(\"Which genre the book pride and prejudice?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "127b16c5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 16.08 ms / 50 runs ( 0.32 ms per token, 3108.68 tokens per second)\n", + "llama_print_timings: prompt eval time = 1005.45 ms / 16 tokens ( 62.84 ms per token, 15.91 tokens per second)\n", + "llama_print_timings: eval time = 6345.52 ms / 49 runs ( 129.50 ms per token, 7.72 tokens per second)\n", + "llama_print_timings: total time = 7517.03 ms / 65 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mRetreival required\n", + "\u001b[0m\u001b[1;3;34mReceived: 10 documents\n", + "\u001b[0m\u001b[1;3;34mStart evaluation\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 13.51 ms / 43 runs ( 0.31 ms per token, 3183.53 tokens per second)\n", + "llama_print_timings: prompt eval time = 2447.83 ms / 39 tokens ( 62.76 ms per token, 15.93 tokens per second)\n", + "llama_print_timings: eval time = 5438.94 ms / 42 runs ( 129.50 ms per token, 7.72 tokens per second)\n", + "llama_print_timings: total time = 8188.26 ms / 81 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]The smallest penguin species, the Little Blue Penguin, stands just about 40 cm tall and is found along the coastlines of southern Australia and New Zealand.\n", + "Prediction: [Relevant]The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.[Fully supported][Utility:5]\n", + "Score: 2.4709723458196087\n", + "\u001b[0m\u001b[1;3;34m1/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 8.51 ms / 26 runs ( 0.33 ms per token, 3054.15 tokens per second)\n", + "llama_print_timings: prompt eval time = 2431.51 ms / 37 tokens ( 65.72 ms per token, 15.22 tokens per second)\n", + "llama_print_timings: eval time = 3271.24 ms / 25 runs ( 130.85 ms per token, 7.64 tokens per second)\n", + "llama_print_timings: total time = 5901.59 ms / 62 
tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]Emperor penguins, the tallest of all penguin species, can dive deeper than any other bird, reaching depths of over 500 meters.\n", + "Prediction: [Relevant]The smallest penguin species is the Emperor Penguin (Aptenodytes forsteri).[Fully supported][Utility:5]\n", + "Score: 2.1767850110288887\n", + "\u001b[0m\u001b[1;3;34m2/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 8.62 ms / 26 runs ( 0.33 ms per token, 3016.59 tokens per second)\n", + "llama_print_timings: prompt eval time = 2846.05 ms / 43 tokens ( 66.19 ms per token, 15.11 tokens per second)\n", + "llama_print_timings: eval time = 3340.62 ms / 25 runs ( 133.62 ms per token, 7.48 tokens per second)\n", + "llama_print_timings: total time = 6433.70 ms / 68 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]A group of penguins, known as a 'waddle' on land, shuffled across the Antarctic ice, their tuxedo-like plumage standing out against the snow.\n", + "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n", + "Score: 1.5998614571701189\n", + "\u001b[0m\u001b[1;3;34m3/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 6.24 ms / 18 runs ( 0.35 ms per token, 2885.54 tokens per second)\n", + "llama_print_timings: prompt eval time = 2461.25 ms / 37 tokens ( 66.52 ms per token, 15.03 tokens per second)\n", + "llama_print_timings: eval time = 2272.68 ms / 17 runs ( 133.69 ms per token, 7.48 tokens per second)\n", + "llama_print_timings: total time = 4892.65 ms / 54 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: ### Instruction:\n", + "How tall is the smallest penguins?\n", + "\n", + "### Response:\n", + "[Retrieval]Despite their upright stance, penguins are birds that cannot fly; their wings have evolved into flippers, making them expert swimmers.\n", + "Prediction: [Relevant]The height of a penguin varies depending on the species.[No support / Contradictory][Utility:5]\n", + "Score: 1.4486356991581153\n", + "\u001b[0m\u001b[1;3;34m4/10 paragraphs done\n", + "\n", + "\u001b[0m" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 1582.98 ms\n", + "llama_print_timings: sample time = 13.34 ms / 39 runs ( 0.34 ms per token, 2923.10 tokens per second)\n", + "llama_print_timings: prompt eval time = 2735.91 ms / 41 tokens ( 66.73 ms per token, 14.99 tokens per second)\n", + "llama_print_timings: eval time = 5088.15 ms / 38 runs ( 133.90 ms per token, 7.47 tokens per second)\n", + "llama_print_timings: total time = 8140.45 ms / 79 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;3;34mInput: 
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>The fastest species, the Gentoo penguin, can swim up to 36 kilometers per hour, using their flippers and streamlined bodies to slice through the water.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (also known as the jackass penguin).[No support / Contradictory][Continue to Use Evidence]They are about 17 inches tall.[Utility:5]\n",
+      "Score: 1.4687150930489146\n",
+      "\u001b[0m\u001b[1;3;34m5/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 13.66 ms / 38 runs ( 0.36 ms per token, 2781.64 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 3413.28 ms / 50 tokens ( 68.27 ms per token, 14.65 tokens per second)\n",
+      "llama_print_timings: eval time = 4859.28 ms / 37 runs ( 131.33 ms per token, 7.61 tokens per second)\n",
+      "llama_print_timings: total time = 8526.62 ms / 87 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins' black and white coloring is a form of camouflage called countershading; from above, their black back blends with the ocean depths, and from below, their white belly matches the bright surface.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor), which can grow to be about 17 inches tall.[No support / Contradictory][Utility:5]\n",
+      "Score: 1.6357659323827645\n",
+      "\u001b[0m\u001b[1;3;34m6/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 8.49 ms / 25 runs ( 0.34 ms per token, 2943.25 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 1744.16 ms / 24 tokens ( 72.67 ms per token, 13.76 tokens per second)\n",
+      "llama_print_timings: eval time = 3862.56 ms / 24 runs ( 160.94 ms per token, 6.21 tokens per second)\n",
+      "llama_print_timings: total time = 5867.77 ms / 48 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins are social birds; many species form large colonies for breeding, which can number in the tens of thousands.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4935304338695037\n",
+      "\u001b[0m\u001b[1;3;34m7/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 9.34 ms / 26 runs ( 0.36 ms per token, 2782.83 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2438.93 ms / 33 tokens ( 73.91 ms per token, 13.53 tokens per second)\n",
+      "llama_print_timings: eval time = 3521.54 ms / 25 runs ( 140.86 ms per token, 7.10 tokens per second)\n",
+      "llama_print_timings: total time = 6158.74 ms / 58 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Intriguingly, penguins have excellent hearing and rely on distinct calls to identify their mates and chicks amidst the noisy colonies.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the African or little penguin (Eudyptula minor).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4074488783945505\n",
+      "\u001b[0m\u001b[1;3;34m8/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 9.03 ms / 26 runs ( 0.35 ms per token, 2878.02 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2850.73 ms / 41 tokens ( 69.53 ms per token, 14.38 tokens per second)\n",
+      "llama_print_timings: eval time = 3430.31 ms / 25 runs ( 137.21 ms per token, 7.29 tokens per second)\n",
+      "llama_print_timings: total time = 6558.69 ms / 66 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>During the breeding season, male Emperor penguins endure the harsh Antarctic winter for months, fasting and incubating their eggs, while females hunt at sea.</paragraph>\n",
+      "Prediction: [Relevant]The smallest penguin species is the Emperor Penguin (Aptenodytes forsteri).[No support / Contradictory][Utility:5]\n",
+      "Score: 1.415058228804781\n",
+      "\u001b[0m\u001b[1;3;34m9/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n",
+      "\n",
+      "llama_print_timings: load time = 1582.98 ms\n",
+      "llama_print_timings: sample time = 7.27 ms / 20 runs ( 0.36 ms per token, 2752.55 tokens per second)\n",
+      "llama_print_timings: prompt eval time = 2766.95 ms / 37 tokens ( 74.78 ms per token, 13.37 tokens per second)\n",
+      "llama_print_timings: eval time = 2538.61 ms / 19 runs ( 133.61 ms per token, 7.48 tokens per second)\n",
+      "llama_print_timings: total time = 5471.43 ms / 56 tokens\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;3;34mInput: ### Instruction:\n",
+      "How tall is the smallest penguins?\n",
+      "\n",
+      "### Response:\n",
+      "[Retrieval]<paragraph>Penguins consume a variety of seafood; their diet mainly consists of fish, squid, and krill, which they catch on their diving expeditions.</paragraph>\n",
+      "Prediction: [Relevant]The height of the smallest penguin species can vary depending on the species.[No support / Contradictory][Utility:5]\n",
+      "Score: 1.4213598342974365\n",
+      "\u001b[0m\u001b[1;3;34m10/10 paragraphs done\n",
+      "\n",
+      "\u001b[0m\u001b[1;3;34mEnd evaluation\n",
+      "\u001b[0m\u001b[1;3;34mSelected the best answer: [Relevant]The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.[Fully supported][Utility:5]\n",
+      "\u001b[0m\u001b[1;3;32mFinal answer: The smallest penguin species is the Little Blue Penguin (also known as the Fairy Penguin), which can grow to be around 40 centimeters in height.\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieval example\n",
+    "response = query_engine.query(\"How tall is the smallest penguins?\")"
penguins?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}