diff --git a/.github/scripts/spellcheck_conf/wordlist.txt b/.github/scripts/spellcheck_conf/wordlist.txt index 1f976aa5a..929a6c7d1 100644 --- a/.github/scripts/spellcheck_conf/wordlist.txt +++ b/.github/scripts/spellcheck_conf/wordlist.txt @@ -1466,3 +1466,42 @@ OCRVQA OCRVQADataCollator ocrvqa langchain +GiB +Terraform +gb +TPOT +ctrl +finetunes +llmcompressor +prefill +qps +terraform +tf +tmux +tpot +ttft +uv +8xL40S +xL +EDA +DeepLearningai +NotebookLM +NotebookLlama +Parler +TTS +parler +suno +tts +Hifigan +MeloTTS +Metavoice +Parler +Parler's +Reddit +Suno +VALL +WhisperSpeech +locallama +myshell +parler +xTTS diff --git a/.github/workflows/pytest_cpu_gha_runner.yaml b/.github/workflows/pytest_cpu_gha_runner.yaml index 584cb58c3..d5d123fc0 100644 --- a/.github/workflows/pytest_cpu_gha_runner.yaml +++ b/.github/workflows/pytest_cpu_gha_runner.yaml @@ -1,16 +1,10 @@ name: "[GHA][CPU] llama-recipes Pytest tests on CPU GitHub hosted runner." on: pull_request: - branches: + branches: - 'main' - paths: - - 'src/llama-recipes/configs/*.py' - - 'src/llama-recipes/utils/*.py' - - 'src/llama-recipes/datasets/*.py' - - 'src/llama-recipes/data/*.py' - - 'src/llama-recipes/*.py' - # triggers workflow manually for debugging purposes. + # triggers workflow manually for debugging purposes. workflow_dispatch: inputs: runner: @@ -23,8 +17,8 @@ on: required: false default: "true" -env: - PYTORCH_WHEEL_URL: https://download.pytorch.org/whl/test/cu118 +env: + PYTORCH_WHEEL_URL: https://download.pytorch.org/whl/test/cu118 jobs: execute_workflow: @@ -63,7 +57,7 @@ jobs: id: install_llama_recipes_package run: | echo "Installing 'llama-recipes' project (re: https://github.com/facebookresearch/llama-recipes?tab=readme-ov-file#install-with-optional-dependencies)" - pip install --extra-index-url ${PYTORCH_WHEEL_URL} -e '.[tests]' + pip install --extra-index-url ${PYTORCH_WHEEL_URL} -e '.[tests]' - name: "Running PyTest tests on GHA CPU Runner" @@ -71,11 +65,10 @@ jobs: run: | echo "Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE}" cd $GITHUB_WORKSPACE && python3 -m pytest --junitxml="$GITHUB_WORKSPACE/result.xml" - + - name: Publish Test Summary id: test_summary uses: test-summary/action@v2 with: paths: "**/*.xml" if: always() - \ No newline at end of file diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 3535422c1..820595dcf 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -4,7 +4,7 @@ To run fine-tuning on multi-GPUs, we will make use of two packages: 1. [PEFT](https://huggingface.co/blog/peft) methods and in particular using the Hugging Face [PEFT](https://github.com/huggingface/peft)library. -2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](LLM_finetuning.md/#2-full-partial-parameter-finetuning). +2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](./LLM_finetuning.md). Given the combination of PEFT and FSDP, we would be able to fine tune a Meta Llama 8B model on multiple GPUs in one node. For big models like 405B we will need to fine-tune in a multi-node setup even if 4bit quantization is enabled. 
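+To make the PEFT + FSDP combination concrete, below is a minimal sketch of how the two pieces fit together. It is only an illustration, not the recipe's actual training code: the llama-recipes finetuning entry point additionally handles auto-wrap policies, mixed precision, checkpointing, and data loading, and the model id and LoRA settings here are placeholders.
+```python
+import os
+
+import torch
+import torch.distributed as dist
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+# Launch with: torchrun --nnodes 1 --nproc_per_node <num_gpus> <this_script>.py
+dist.init_process_group("nccl")
+torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
+# Placeholder model id and LoRA settings; the recipe exposes these via its configs.
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16
+)
+peft_config = LoraConfig(
+    r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"
+)
+model = get_peft_model(model, peft_config)
+
+# Shard the mostly-frozen model across ranks. use_orig_params=True lets FSDP mix
+# frozen base weights with trainable LoRA parameters in the same module.
+model = FSDP(model, device_id=torch.cuda.current_device(), use_orig_params=True)
+
+# ... standard training loop over a distributed dataloader goes here ...
+```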
diff --git a/recipes/3p_integrations/crusoe/README.md b/recipes/3p_integrations/crusoe/README.md
new file mode 100644
index 000000000..fc13af0c5
--- /dev/null
+++ b/recipes/3p_integrations/crusoe/README.md
@@ -0,0 +1,11 @@
+Below are recipes for deploying common Llama workflows on [Crusoe's](https://crusoe.ai) high-performance, sustainable cloud. Each workflow corresponds to a subfolder with its own README and supplemental materials. Please reference the table below for hardware requirements.
+
+| Workflow | Model(s) | VM type | Storage |
+| :----: | :----: | :----: | :----: |
+| [Serving Llama3.1 in FP8 with vLLM](vllm-fp8/) | [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct), [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | l40s-48gb.8x | 256 GiB Persistent Disk |
+
+# Requirements
+First, ensure that you have a Crusoe account (you can sign up [here](https://console.crusoecloud.com/)). We will provision resources using Terraform; please ensure that your environment is configured, and refer to the Crusoe [docs](https://github.com/crusoecloud/terraform-provider-crusoe?tab=readme-ov-file#getting-started) for guidance.
+
+# Serving Models
+Some recipes in this repo require firewall rules to expose ports in order to reach the inference server. To manage firewall rules, please refer to our [networking documentation](https://docs.crusoecloud.com/networking/firewall-rules/managing-firewall-rules).
diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/README.md b/recipes/3p_integrations/crusoe/vllm-fp8/README.md
new file mode 100644
index 000000000..1c26f9413
--- /dev/null
+++ b/recipes/3p_integrations/crusoe/vllm-fp8/README.md
@@ -0,0 +1,85 @@
+In this article, we will show how to benchmark FP8 models on L40S GPUs using the vLLM inference engine. At the end, you should have an understanding of how to use `llm-compressor` to quantize existing higher-precision Llama3 finetunes to FP8, benchmark throughput and latency to compare performance, and finally serve models using `vllm`.
+
+# Provisioning Resources
+First, navigate to this repository from your local machine. Update the corresponding variables in `locals` inside `main.tf` to match your environment (e.g., the path to your SSH key), then initialize the terraform project with `terraform init` and provision resources with `terraform apply`. Note that this will create a VM equipped with 8xL40S and a 256 GiB persistent disk. After the VM has been created, terraform will output the public IP address.
+
+## Mount Storage
+`ssh` into your VM. Then, run the commands below to mount the attached disk to `/scratch`.
+```bash
+mkfs.ext4 /dev/vdb
+mkdir /scratch
+mount -t ext4 /dev/vdb /scratch
+cd /scratch
+```
+
+# Install Dependencies
+We'll use [uv](https://github.com/astral-sh/uv) to install dependencies. First, install the tool with:
+```bash
+apt-get update && apt-get install -y curl
+apt-get install tmux
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.cargo/env
+```
+
+Now, clone the recipes and navigate to this tutorial. Initialize the virtual environment and install dependencies:
+```bash
+git clone https://github.com/meta-llama/llama-recipes.git
+cd llama-recipes/recipes/3p_integrations/crusoe/vllm-fp8/
+uv add vllm setuptools
+```
+
+# Run Benchmarks
+Before starting the vLLM server, we'll configure HuggingFace to save to our shared disk, specify the model tag, and set tensor parallelism to 1.
+```bash
+export HF_HOME=/scratch/
+export MODEL=neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic
+export TP_SIZE=1
+```
+Now, we'll use tmux to run our server inside a detachable session.
+```bash
+tmux new -s server
+uv run vllm serve $MODEL --enable-chunked-prefill --disable-log-requests --tensor-parallel-size $TP_SIZE
+```
+vLLM will download the model from HF and serve it on port 8000. Detach from the tmux session (`ctrl+b`, then `d`); next, we'll simulate a client.
+```bash
+tmux new -s client
+chmod +x run_benchmark.sh
+./run_benchmark.sh
+```
+Let's inspect the benchmark script to see what's going on.
+```bash
+TOTAL_SECONDS=120
+QPS_RATES=("1" "3" "5" "7" "9")
+
+for QPS in ${QPS_RATES[@]}; do
+    NUM_PROMPTS=$((TOTAL_SECONDS * QPS))
+    echo "===== RUNNING NUM_PROMPTS = $NUM_PROMPTS QPS = $QPS ====="
+
+    uv run benchmarks/benchmark_serving.py \
+        --model $MODEL \
+        --dataset-name sonnet --sonnet-input-len 550 --sonnet-output-len 150 --dataset-path benchmarks/sonnet.txt \
+        --num-prompts $NUM_PROMPTS --request-rate $QPS --save-result
+done
+```
+This is a convenience wrapper that re-runs the vLLM `benchmarks/benchmark_serving.py` script with the queries-per-second (QPS) rate gradually increasing from 1 to 9, saving the results of each run. After each run completes, a JSON file containing inference statistics appears in the same directory.
+
+# Results
+We repeated the above benchmark across the fp8 and fp16 versions of both Llama3.1 8B and 70B.
+
+![TPOT vs QPS](assets/tpot_vs_qps_chart.png "TPOT vs QPS")
+In the chart above, we compare time-per-output-token (TPOT) across different QPS rates. The fp16 70B model runs across 8 GPUs, while the fp8 version uses only 4 yet maintains the same TPOT range. The 8B models each run on a single GPU, though fp8 is noticeably faster.
+
+![TTFT vs QPS](assets/ttft_vs_qps_chart.png "TTFT vs QPS")
+Looking at time-to-first-token (TTFT), we observe the same trends. Even though the fp8 70B model runs on half as many GPUs, its TTFT is roughly the same as that of the fp16 version on 8 GPUs.
+
+# Converting Llama3 models to FP8
+If you wish to convert your existing finetunes to FP8, you can do so using [llmcompressor](https://github.com/vllm-project/llm-compressor); a minimal sketch of such a conversion script is included at the end of this guide.
+```bash
+uv add llmcompressor
+uv run convert_hf_to_fp8.py NousResearch/Hermes-3-Llama-3.1-70B
+```
+
+To use the converted model, update `$MODEL` to the absolute path of the converted checkpoint, then rerun `uv run vllm serve $MODEL --enable-chunked-prefill --disable-log-requests --tensor-parallel-size $TP_SIZE`. Now we have a vLLM server up with our converted finetune and can rerun the previous benchmarks to verify performance.
+
+# Cleaning up
+To clean up the resources we've provisioned, simply run `terraform destroy` from within this repository on your local machine.
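+As a quick sanity check before (or after) running the benchmark sweep, you can send a single request to the running server. Below is a minimal sketch, assuming the server started earlier is still listening on `localhost:8000` and that the `openai` Python client is installed (e.g., via `uv add openai`); the model name must match the `$MODEL` tag you served.
+```python
+from openai import OpenAI
+
+# vLLM exposes an OpenAI-compatible API; the API key is just a placeholder.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+completion = client.completions.create(
+    model="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # must match $MODEL
+    prompt="San Francisco is a",
+    max_tokens=32,
+    temperature=0.0,
+)
+print(completion.choices[0].text)
+```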
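+For reference, here is a minimal sketch of what a conversion script along the lines of `convert_hf_to_fp8.py` might look like, using llmcompressor's one-shot FP8 dynamic quantization. The recipe's actual script may differ, exact import paths and save options vary between llmcompressor versions, and the `MODEL_ID`/`SAVE_DIR` handling here is only illustrative.
+```python
+import sys
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# Hypothetical CLI: pass the HF model id or local path as the first argument,
+# e.g. `python convert_hf_to_fp8.py NousResearch/Hermes-3-Llama-3.1-70B`.
+MODEL_ID = sys.argv[1]
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-dynamic"
+
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Quantize all Linear layers to FP8 with dynamic activation scales,
+# keeping lm_head in higher precision.
+recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+oneshot(model=model, recipe=recipe)
+
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+Point `$MODEL` at the resulting `SAVE_DIR` (as an absolute path) to serve the quantized checkpoint with vLLM as shown earlier.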
diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/assets/tpot_vs_qps_chart.png b/recipes/3p_integrations/crusoe/vllm-fp8/assets/tpot_vs_qps_chart.png new file mode 100644 index 000000000..de2af6126 Binary files /dev/null and b/recipes/3p_integrations/crusoe/vllm-fp8/assets/tpot_vs_qps_chart.png differ diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/assets/ttft_vs_qps_chart.png b/recipes/3p_integrations/crusoe/vllm-fp8/assets/ttft_vs_qps_chart.png new file mode 100644 index 000000000..b95e18188 Binary files /dev/null and b/recipes/3p_integrations/crusoe/vllm-fp8/assets/ttft_vs_qps_chart.png differ diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/backend_request_func.py b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/backend_request_func.py new file mode 100644 index 000000000..f7d67692f --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/backend_request_func.py @@ -0,0 +1,427 @@ +import json +import os +import sys +import time +import traceback +from dataclasses import dataclass, field +from typing import List, Optional, Union + +import aiohttp +import huggingface_hub.constants +from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + best_of: int = 1 + use_beam_search: bool = False + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0.0 + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies + prompt_len: int = 0 + error: str = "" + + +async def async_request_tgi( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + params = { + "best_of": request_func_input.best_of, + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. + } + payload = { + "inputs": request_func_input.prompt, + "parameters": params, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + + #NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. 
+ if chunk_bytes.startswith(":"): + continue + chunk = remove_prefix(chunk_bytes, "data:") + + data = json.loads(chunk) + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + output.generated_text = data["generated_text"] + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + assert request_func_input.best_of == 1 + payload = { + "accumulate_tokens": True, + "text_input": request_func_input.prompt, + "temperature": 0.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), + "data:") + + data = json.loads(chunk) + output.generated_text += data["text_output"] + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_deepspeed_mii( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert request_func_input.best_of == 1 + assert not request_func_input.use_beam_search + + payload = { + "prompt": request_func_input.prompt, + "max_tokens": request_func_input.output_len, + "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. + "top_p": 1.0, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. 
+ # See https://github.com/microsoft/DeepSpeed-MII/pull/311 + output.ttft = 0 + + st = time.perf_counter() + try: + async with session.post(url=request_func_input.api_url, + json=payload) as response: + if response.status == 200: + parsed_resp = await response.json() + output.latency = time.perf_counter() - st + output.generated_text = parsed_resp["text"][0] + output.success = True + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + ("completions", "profile") + ), "OpenAI Completions API URL must end with 'completions' or 'profile'." + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data["choices"][0]["text"]: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data["choices"][0]["text"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "chat/completions" + ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.prompt, + }, + ], + "temperature": 0.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + timestamp = time.perf_counter() + data = json.loads(chunk) + + delta = data["choices"][0]["delta"] + if delta.get("content", None): + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) +# introduced in Python 3.9 +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix):] + return text + + +def get_model(pretrained_model_name_or_path: str) -> str: + if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + + return model_path + return pretrained_model_name_or_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + +ASYNC_REQUEST_FUNCS = { + "tgi": async_request_tgi, + "vllm": async_request_openai_completions, + "lmdeploy": async_request_openai_completions, + "deepspeed-mii": async_request_deepspeed_mii, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, +} diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/benchmark_serving.py b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/benchmark_serving.py new file mode 100644 index 000000000..fe687da49 --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/benchmark_serving.py @@ 
-0,0 +1,770 @@ +"""Benchmark online serving throughput. + +On the server side, run one of the following commands: + vLLM OpenAI API server + vllm serve \ + --swap-space 16 \ + --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset-name sharegpt \ + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" +import argparse +import asyncio +import json +import os +import random +import time +import warnings +from dataclasses import dataclass +from datetime import datetime +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple + +import numpy as np +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + input_throughput: float + output_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + p99_ttft_ms: float + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + p99_tpot_ms: float + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + p99_itl_ms: float + + +def sample_sharegpt_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, +) -> List[Tuple[str, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: List[Tuple[str, int, int]] = [] + for i in range(len(dataset)): + if len(filtered_dataset) == num_requests: + break + + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt_token_ids = tokenizer(prompt).input_ids + completion = dataset[i][1] + completion_token_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = len(completion_token_ids + ) if fixed_output_len is None else fixed_output_len + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. 
+ continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + return filtered_dataset + + +def sample_sonnet_requests( + dataset_path: str, + num_requests: int, + input_len: int, + output_len: int, + prefix_len: int, + tokenizer: PreTrainedTokenizerBase, +) -> List[Tuple[str, str, int, int]]: + assert ( + input_len > prefix_len + ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." + + # Load the dataset. + with open(dataset_path) as f: + poem_lines = f.readlines() + + # Tokenize the poem lines. + poem_token_ids = tokenizer(poem_lines).input_ids + average_poem_len = sum( + len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) + + # Base prefix for all requests. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_message = [{ + "role": "user", + "content": base_prompt, + }] + base_prompt_formatted = tokenizer.apply_chat_template( + base_message, add_generation_prompt=True, tokenize=False) + base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) + + assert ( + input_len > base_prompt_offset + ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." + num_input_lines = round( + (input_len - base_prompt_offset) / average_poem_len) + + # First approximately `prefix_len` number of tokens in the + # prompt are fixed poem lines. + assert ( + prefix_len > base_prompt_offset + ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." + + num_prefix_lines = round( + (prefix_len - base_prompt_offset) / average_poem_len) + prefix_lines = poem_lines[:num_prefix_lines] + + # Sample the rest of lines per request. + sampled_requests: List[Tuple[str, int, int]] = [] + for _ in range(num_requests): + sampled_lines = "".join( + prefix_lines + + random.sample(poem_lines, num_input_lines - num_prefix_lines)) + + prompt = f"{base_prompt}{sampled_lines}" + message = [ + { + "role": "user", + "content": prompt, + }, + ] + prompt_formatted = tokenizer.apply_chat_template( + message, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + sampled_requests.append( + (prompt, prompt_formatted, prompt_len, output_len)) + + return sampled_requests + + +def sample_random_requests( + input_len: int, output_len: int, num_prompts: int, range_ratio: float, + tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]: + + input_lens = np.random.randint( + int(input_len * range_ratio), + input_len + 1, + size=num_prompts, + ) + output_lens = np.random.randint( + int(output_len * range_ratio), + output_len + 1, + size=num_prompts, + ) + offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) + input_requests = [] + for i in range(num_prompts): + prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size + for j in range(input_lens[i])]) + input_requests.append( + (prompt, int(input_lens[i]), int(output_lens[i]))) + + return input_requests + + +async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, +) -> AsyncGenerator[Tuple[str, int, int], None]: + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the exponential distribution. + interval = np.random.exponential(1.0 / request_rate) + # The next request will be sent after the interval. 
+ await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i][1] + if output_len > 1: + tpots.append( + (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + input_throughput=total_input / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + median_ttft_ms=np.median(ttfts or 0) * 1000, + std_ttft_ms=np.std(ttfts or 0) * 1000, + p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, + mean_tpot_ms=np.mean(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, + mean_itl_ms=np.mean(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + p99_itl_ms=np.percentile(itls or 0, 99) * 1000, + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[Tuple[str, int, int]], + best_of: int, + use_beam_search: bool, + request_rate: float, + disable_tqdm: bool, + profile: bool, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + best_of=best_of, + use_beam_search=use_beam_search, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. 
Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + best_of=best_of, + use_beam_search=use_beam_search, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + print(f"Traffic request rate: {request_rate}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate): + prompt, prompt_len, output_len = request + request_func_input = RequestFuncInput( + model=model_id, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + best_of=best_of, + use_beam_search=use_beam_search, + ) + tasks.append( + asyncio.create_task( + request_func(request_func_input=request_func_input, + pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + best_of=best_of, + use_beam_search=use_beam_search, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", + metrics.input_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) + print("{:<40} {:<10.2f}".format("Median TTFT (ms):", + metrics.median_ttft_ms)) + print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) + print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 
1st token)', + n=50, + c='-')) + print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) + print("{:<40} {:<10.2f}".format("Median TPOT (ms):", + metrics.median_tpot_ms)) + print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) + print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-')) + print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms)) + print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms)) + print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms)) + print("=" * 50) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "input_throughput": metrics.input_throughput, + "output_throughput": metrics.output_throughput, + "mean_ttft_ms": metrics.mean_ttft_ms, + "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, + "p99_ttft_ms": metrics.p99_ttft_ms, + "mean_tpot_ms": metrics.mean_tpot_ms, + "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, + "p99_tpot_ms": metrics.p99_tpot_ms, + "mean_itl_ms": metrics.mean_itl_ms, + "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, + "p99_itl_ms": metrics.p99_itl_ms, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + return result + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, + trust_remote_code=args.trust_remote_code) + + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next " + "release. Please use '--dataset-name' and " + "'--dataset-path' in the future runs.", + stacklevel=2) + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + + elif args.dataset_name == "sharegpt": + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + + elif args.dataset_name == "sonnet": + # Do not format the prompt, pass to message directly + if args.backend == "openai-chat": + input_requests = sample_sonnet_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) + input_requests = [(prompt, prompt_len, output_len) + for prompt, prompt_formatted, prompt_len, + output_len in input_requests] + else: + assert ( + tokenizer.chat_template or tokenizer.default_chat_template + ), "Tokenizer/model must have chat template for sonnet dataset." 
+ input_requests = sample_sonnet_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) + input_requests = [(prompt_formatted, prompt_len, output_len) + for prompt, prompt_formatted, prompt_len, + output_len in input_requests] + + elif args.dataset_name == "random": + input_requests = sample_random_requests( + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + ) + + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + best_of=args.best_of, + use_beam_search=args.use_beam_search, + request_rate=args.request_rate, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + )) + + # Save config and results to json + if args.save_result: + result_json: Dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["backend"] = backend + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["best_of"] = args.best_of + result_json["use_beam_search"] = args.use_beam_search + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." + ) + + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf") + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + # Save to file + base_model_id = model_id.split("/")[-1] + file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + file_name = os.path.join(args.result_dir, file_name) + with open(file_name, "w") as outfile: + json.dump(result_json, outfile) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in the " + "next release.", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "sonnet", "random"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument("--dataset-path", + type=str, + default=None, + help="Path to the dataset.") + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the 
tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and " + "returns the best one.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.") + parser.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for random sampling.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." 
+ "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + + args = parser.parse_args() + main(args) diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/sonnet.txt b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/sonnet.txt new file mode 100644 index 000000000..34c444e8c --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/benchmarks/sonnet.txt @@ -0,0 +1,518 @@ +FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. +When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. 
+Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. +Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. +The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. 
+O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. +When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? 
+And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. +A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. 
+O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. 
+Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. +When, in disgrace with fortune and men's eyes, +I all alone beweep my outcast state +And trouble deal heaven with my bootless cries +And look upon myself and curse my fate, +Wishing me like to one more rich in hope, +Featured like him, like him with friends possess'd, +Desiring this man's art and that man's scope, +With what I most enjoy contented least; +Yet in these thoughts myself almost despising, +Haply I think on thee, and then my state, +Like to the lark at break of day arising +From sullen earth, sings hymns at heaven's gate; +For thy sweet love remember'd such wealth brings +That then I scorn to change my state with kings. +When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. +Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. 
+O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. +I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. +Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me! 
\ No newline at end of file diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/convert_hf_to_fp8.py b/recipes/3p_integrations/crusoe/vllm-fp8/convert_hf_to_fp8.py new file mode 100644 index 000000000..f4591701c --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/convert_hf_to_fp8.py @@ -0,0 +1,59 @@ +import torch +import argparse +from transformers import AutoTokenizer +from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot +from llmcompressor.transformers.compression.helpers import ( # noqa + calculate_offload_device_map, + custom_offload_device_map, +) + +def main(): + parser = argparse.ArgumentParser(description="Compress a language model.") + parser.add_argument("model_stub", type=str, help="The model stub (e.g., 'bosonai/Higgs-Llama-3-70B')") + args = parser.parse_args() + + recipe = """ + quant_stage: + quant_modifiers: + QuantizationModifier: + ignore: ["lm_head"] + config_groups: + group_0: + weights: + num_bits: 8 + type: float + strategy: channel + dynamic: false + symmetric: true + input_activations: + num_bits: 8 + type: float + strategy: token + dynamic: true + symmetric: true + targets: ["Linear"] + """ + + model_stub = args.model_stub + model_name = model_stub.split("/")[-1] + + device_map = calculate_offload_device_map( + model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype=torch.float16 + ) + + model = SparseAutoModelForCausalLM.from_pretrained( + model_stub, torch_dtype=torch.float16, device_map=device_map + ) + + output_dir = f"./{model_name}-FP8-dynamic" + + oneshot( + model=model, + recipe=recipe, + output_dir=output_dir, + save_compressed=True, + tokenizer=AutoTokenizer.from_pretrained(model_stub), + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/main.tf b/recipes/3p_integrations/crusoe/vllm-fp8/main.tf new file mode 100644 index 000000000..39572144b --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/main.tf @@ -0,0 +1,41 @@ +terraform { + required_providers { + crusoe = { + source = "registry.terraform.io/crusoecloud/crusoe" + } + } +} + +locals { + my_ssh_key = file("~/.ssh/id_ed25519.pub") +} + +// new VM +resource "crusoe_compute_instance" "vllm_vm" { + name = "vllm-example" + type = "l40s-48gb.8x" + location = "us-southcentral1-a" + + # specify the base image + image = "ubuntu22.04-nvidia-slurm:12.4" + + disks = [ + { + id = crusoe_storage_disk.vllm_data_disk.id + mode = "read-write" + attachment_type = "data" + } + ] + + ssh_key = local.my_ssh_key +} + +resource "crusoe_storage_disk" "vllm_data_disk" { + name = "vllm-example-disk" + size = "256GiB" + location = "us-southcentral1-a" +} + +output "instance_public_ip" { + value = crusoe_compute_instance.vllm_vm.network_interfaces[0].public_ipv4.address +} diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/plot.py b/recipes/3p_integrations/crusoe/vllm-fp8/plot.py new file mode 100644 index 000000000..ff0134f19 --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/plot.py @@ -0,0 +1,72 @@ +import json +import os +import re +import matplotlib.pyplot as plt +import numpy as np +from collections import defaultdict + +def extract_info_from_filename(filename): + pattern = r'(?P<backend>[^-]+)-(?P<qps>\d+\.\d+)qps-(?P<model>.+)-(?P<timestamp>\d{8}-\d{6})\.json' + match = re.match(pattern, filename) + if match: + return { + 'qps': float(match.group('qps')), + 'model': match.group('model') + } + return None + +def read_json_files(directory): + data_tpot = defaultdict(list) + data_ttft = defaultdict(list) + for filename in 
os.listdir(directory): + if filename.endswith('.json'): + filepath = os.path.join(directory, filename) + file_info = extract_info_from_filename(filename) + if file_info: + with open(filepath, 'r') as file: + json_data = json.load(file) + median_tpot = json_data.get('median_tpot_ms') + std_tpot = json_data.get('std_tpot_ms') + median_ttft = json_data.get('median_ttft_ms') + std_ttft = json_data.get('std_ttft_ms') + if all(v is not None for v in [median_tpot, std_tpot, median_ttft, std_ttft]): + data_tpot[file_info['model']].append((file_info['qps'], median_tpot, std_tpot)) + data_ttft[file_info['model']].append((file_info['qps'], median_ttft, std_ttft)) + return { + 'tpot': {model: sorted(points) for model, points in data_tpot.items()}, + 'ttft': {model: sorted(points) for model, points in data_ttft.items()} + } + +def create_chart(data, metric, filename): + plt.figure(figsize=(12, 6)) + + colors = plt.cm.rainbow(np.linspace(0, 1, len(data))) + for (model, points), color in zip(data.items(), colors): + qps_values, median_values, std_values = zip(*points) + plt.errorbar(qps_values, median_values, yerr=std_values, fmt='o-', capsize=5, capthick=2, label=model, color=color) + plt.fill_between(qps_values, + np.array(median_values) - np.array(std_values), + np.array(median_values) + np.array(std_values), + alpha=0.2, color=color) + + plt.xlabel('QPS (Queries Per Second)') + plt.ylabel(f'Median {metric.upper()} (ms)') + plt.title(f'Median {metric.upper()} vs QPS with Standard Deviation') + plt.grid(True) + plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(filename, dpi=300, bbox_inches='tight') + plt.close() + +def main(): + directory = './' + data = read_json_files(directory) + if data['tpot'] and data['ttft']: + create_chart(data['tpot'], 'tpot', 'tpot_vs_qps_chart.png') + create_chart(data['ttft'], 'ttft', 'ttft_vs_qps_chart.png') + print("Charts have been saved as 'tpot_vs_qps_chart.png' and 'ttft_vs_qps_chart.png'") + else: + print("No valid data found in the specified directory.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/pyproject.toml b/recipes/3p_integrations/crusoe/vllm-fp8/pyproject.toml new file mode 100644 index 000000000..b05d700f1 --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "vllm-l40s" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "setuptools>=74.0.0", + "vllm>=0.5.5", + "matplotlib>=3.9.2", + "llmcompressor>=0.1.0", +] diff --git a/recipes/3p_integrations/crusoe/vllm-fp8/run_benchmark.sh b/recipes/3p_integrations/crusoe/vllm-fp8/run_benchmark.sh new file mode 100755 index 000000000..2ca160600 --- /dev/null +++ b/recipes/3p_integrations/crusoe/vllm-fp8/run_benchmark.sh @@ -0,0 +1,12 @@ +TOTAL_SECONDS=120 +QPS_RATES=("1" "3" "5" "7" "9") + +for QPS in ${QPS_RATES[@]}; do + NUM_PROMPTS=$((TOTAL_SECONDS * QPS)) + echo "===== RUNNING NUM_PROMPTS = $NUM_PROMPTS QPS = $QPS =====" + + uv run benchmarks/benchmark_serving.py \ + --model $MODEL \ + --dataset-name sonnet --sonnet-input-len 550 --sonnet-output-len 150 --dataset-path benchmarks/sonnet.txt \ + --num-prompts $NUM_PROMPTS --request-rate $QPS --save-result +done \ No newline at end of file diff --git a/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md b/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md index 0f27972e6..deeee9a9c 
100644 --- a/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md +++ b/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md @@ -2,10 +2,10 @@ The folder here containts the Llama 3 ported notebooks of the DLAI short course [Building Agentic RAG with Llamaindex](https://www.deeplearning.ai/short-courses/building-agentic-rag-with-llamaindex/). -1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder. +1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder. 2. [Building Agentic RAG with Llamaindex L2 Tool Calling](Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb) shows how to use Llama 3 to not only pick a function to execute, but also infer an argument to pass through the function. 3. [Building Agentic RAG with Llamaindex L3 Building an Agent Reasoning Loop](Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb) shows how to define a complete agent reasoning loop to reason over tools and multiple steps on a complex question the user asks about a single document while maintaining memory. -3. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity. \ No newline at end of file +3. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity. diff --git a/recipes/experimental/long_context/H2O/README.md b/recipes/experimental/long_context/H2O/README.md index 675e1ef68..20167f50d 100644 --- a/recipes/experimental/long_context/H2O/README.md +++ b/recipes/experimental/long_context/H2O/README.md @@ -8,7 +8,7 @@ Besides, LLMs usually have poor generation to long sequence during inference. H2 Current implementation supports llama-1/2/3, from 7B to 70B. Since H2O only maintains the most important KV pairs, it might missing some important information in the middle content for some knowlege-intensive tasks. -More details please refer to Paper: **https://arxiv.org/pdf/2306.14048**; Blog: **https://allenz.work/?p=11**. 
+For more details, please refer to the paper: **https://arxiv.org/pdf/2306.14048**. **Note: this implementation is tested with transformers == 4.39.0** @@ -21,7 +21,7 @@ python run_summarization.py \ --input-path data/summarization/xsum.jsonl \ --output-path summarization_output/xsum_h2o.jsonl \ --model-name meta-llama/Meta-Llama-3-8B \ ---enable_h2o_generation +--enable_h2o_generation ``` ##### **Results** @@ -36,7 +36,7 @@ Expected results on XSUM (Rouge-2 score, the higher the better) from the above s ### One Demo on Streaming to "Infinite" Context Length -The following example demonstrates the generation process of "infinite" sequence length. We use MT-Bench data and generate the context sample-by-sample. The KV Cache will keep the KV pairs from the previous samples while maintain a fixed size. Results can be found on [Demo](https://allenz.work/?p=11) (Video 1). +The following example demonstrates the generation process of "infinite" sequence length. We use MT-Bench data and generate the context sample-by-sample. The KV Cache will keep the KV pairs from the previous samples while maintaining a fixed size. ``` # run with full cache diff --git a/recipes/quickstart/NotebookLlama/README.md b/recipes/quickstart/NotebookLlama/README.md new file mode 100644 index 000000000..70293c7f5 --- /dev/null +++ b/recipes/quickstart/NotebookLlama/README.md @@ -0,0 +1,95 @@ +## NotebookLlama: An Open Source version of NotebookLM + +![NotebookLlama](./resources/Outline.jpg) + +[Listen to audio from the example here](./resources/_podcast.mp3) + +This is a guided series of tutorials/notebooks that can be taken as a reference or course to build a PDF-to-Podcast workflow. + +You will also learn from our experiments with Text-to-Speech models. + +It assumes zero knowledge of LLMs, prompting and audio models; everything is covered in the respective notebooks. + +### Outline: + +Here is the step-by-step thought (pun intended) for the task: + +- Step 1: Pre-process PDF: Use `Llama-3.2-1B-Instruct` to pre-process the PDF and save it in a `.txt` file. +- Step 2: Transcript Writer: Use the `Llama-3.1-70B-Instruct` model to write a podcast transcript from the text. +- Step 3: Dramatic Re-Writer: Use the `Llama-3.1-8B-Instruct` model to make the transcript more dramatic. +- Step 4: Text-To-Speech Workflow: Use `parler-tts/parler-tts-mini-v1` and `bark/suno` to generate a conversational podcast. + +Note 1: In Step 1, we prompt the 1B model not to modify or summarize the text, but strictly to clean up extra or garbage characters that might get picked up due to encoding from the PDF. Please see the prompt in Notebook 1 for more details. + +Note 2: For Step 2, you can also use the `Llama-3.1-8B-Instruct` model; we recommend experimenting to see if you notice any differences. The 70B model was used here because it gave slightly more creative podcast transcripts for the tested examples. + +Note 3: For Step 4, please try to extend the approach with other models. These models were chosen because they worked best on a sample prompt; newer models might sound better. Please see [Notes](./TTS_Notes.md) for some of the sample tests. + +### Detailed steps on running the notebook: + +Requirements: a GPU server or an API provider for the 70B, 8B and 1B Llama models. +For running the 70B model, you will need a GPU with around 140GB of aggregate memory to run inference in bfloat-16 precision. + +Note: For our GPU-poor friends, you can also use the 8B and lower models for the entire pipeline. There is no strong recommendation. 
The pipeline below is what worked best in our first few tests. You should try and see what works best for you! + +- Before getting started, please make sure to log in using the `huggingface-cli` and then launch your Jupyter notebook server to make sure you are able to download the Llama models. + +You'll need your Hugging Face access token, which you can get at your Settings page [here](https://huggingface.co/settings/tokens). Then run `huggingface-cli login` and copy and paste your Hugging Face access token to complete the login, so that the scripts can download Hugging Face models if needed. + +- First, please install the requirements by running the following inside the folder: + +``` +git clone https://github.com/meta-llama/llama-recipes +cd llama-recipes/recipes/quickstart/NotebookLlama/ +pip install -r requirements.txt +``` + +- Notebook 1: + +This notebook processes the PDF and converts it into a `.txt` file using the new featherlight model. + +Decide on a PDF to use for Notebook 1; it can be any link, but please remember to update the first cell of the notebook with the right link. + +Please try changing the prompts for the `Llama-3.2-1B-Instruct` model and see if you can improve the results. + +- Notebook 2: + +This notebook will take in the processed output from Notebook 1 and creatively convert it into a podcast transcript using the `Llama-3.1-70B-Instruct` model. If you are GPU rich, please feel free to test with the 405B model! + +Please try experimenting with the system prompts for the model to see if you can improve the results, and try the 8B model here as well to see if there is a huge difference! + +- Notebook 3: + +This notebook takes the transcript from earlier and prompts `Llama-3.1-8B-Instruct` to add more dramatization and interruptions in the conversations. + +There is also a key detail here: we return the conversation as a list of tuples, which makes our lives easier later. Yes, studying Data Structures 101 was actually useful for once! + +For our TTS logic, we use two different models that behave differently with certain prompts. So we prompt the model to add specifics for each speaker accordingly. + +Please again try changing the system prompt and see if you can improve the results. We encourage testing the featherlight 3B and 1B models as well at this stage. + +- Notebook 4: + +Finally, we take the results from the last notebook and convert them into a podcast. We use the `parler-tts/parler-tts-mini-v1` and `bark/suno` models for the conversation. + +The speakers and the prompt for the Parler model were decided based on experimentation and suggestions from the model authors. Please try experimenting; you can find more details in the resources section. + + +#### Note: Right now there is one issue: Parler needs transformers 4.43.3 or earlier, while steps 1 to 3 of the pipeline need the latest version, so we just switch versions in the last notebook. + +### Next-Improvements/Further ideas: + +- Speech Model experimentation: The TTS model is the limiting factor in how natural the podcast will sound. This can probably be improved with a better pipeline and with the help of someone more knowledgeable. PRs are welcome! :) +- LLM vs LLM Debate: Another approach to writing the podcast would be to have two agents debate the topic of interest and write the podcast outline. 
Right now we use a single LLM (70B) to write the podcast outline +- Testing 405B for writing the transcripts +- Better prompting +- Support for ingesting a website, audio file, YouTube links and more. Again, we welcome community PRs! + +### Resources for further learning: + +- https://betterprogramming.pub/text-to-audio-generation-with-bark-clearly-explained-4ee300a3713a +- https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing +- https://colab.research.google.com/drive/1eJfA2XUa-mXwdMy7DoYKVYHI1iTd9Vkt?usp=sharing#scrollTo=NyYQ--3YksJY +- https://replicate.com/suno-ai/bark?prediction=zh8j6yddxxrge0cjp9asgzd534 +- https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c + diff --git a/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb new file mode 100644 index 000000000..e4bf71d38 --- /dev/null +++ b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb @@ -0,0 +1,2741 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4f67a6a6", + "metadata": {}, + "source": [ + "## Notebook 1: PDF Pre-processing" + ] + }, + { + "cell_type": "markdown", + "id": "f68aee84-04e3-4cbc-be78-6de9e06e704f", + "metadata": {}, + "source": [ + "In the series, we will be going from a PDF to Podcast using all open models. \n", + "\n", + "The first step in getting to the podcast is finding a script, right now our logic is:\n", + "- Use any PDF on any topic\n", + "- Prompt `Llama-3.2-1B-Instruct` model to process it into a text file\n", + "- Re-write this into a podcast transcript in next notebook.\n", + "\n", + "In this notebook, we will upload a PDF and save it into a `.txt` file using the `PyPDF2` library, later we will process chunks from the text file using our featherlight model." + ] + }, + { + "cell_type": "markdown", + "id": "61cb3584", + "metadata": {}, + "source": [ + "Most of us shift-enter pass the comments to realise later we need to install libraries. For the few that read the instructions, please remember to do so:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "f4fc7aef-3505-482e-a998-790b8b9d48e4", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install PyPDF2\n", + "#!pip install rich ipywidgets" + ] + }, + { + "cell_type": "markdown", + "id": "7b23d509", + "metadata": {}, + "source": [ + "Assuming you have a PDF uploaded on the same machine, please set the path for the file. 
\n", + "\n", + "Also, if you want to flex your GPU-please switch to a bigger model although the featherlight models work perfectly for this task:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "60d0061b-8b8c-4353-850f-f19466a0ae2d", + "metadata": {}, + "outputs": [], + "source": [ + "pdf_path = './resources/2402.13116v3.pdf'\n", + "DEFAULT_MODEL = \"meta-llama/Llama-3.2-1B-Instruct\"" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "21029232-ac5f-42ca-b26b-baad5b2f49b7", + "metadata": {}, + "outputs": [], + "source": [ + "import PyPDF2\n", + "from typing import Optional\n", + "import os\n", + "import torch\n", + "from accelerate import Accelerator\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "from tqdm.notebook import tqdm\n", + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "203c22eb", + "metadata": {}, + "source": [ + "Let's make sure we don't stub our toe by checking if the file exists" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "153d9ece-37a4-4fff-a8e8-53f923a2b0a0", + "metadata": {}, + "outputs": [], + "source": [ + "def validate_pdf(file_path: str) -> bool:\n", + " if not os.path.exists(file_path):\n", + " print(f\"Error: File not found at path: {file_path}\")\n", + " return False\n", + " if not file_path.lower().endswith('.pdf'):\n", + " print(\"Error: File is not a PDF\")\n", + " return False\n", + " return True" + ] + }, + { + "cell_type": "markdown", + "id": "5a362ac3", + "metadata": {}, + "source": [ + "Convert PDF to a `.txt` file. This would simply read and dump the contents of the file. We set the maximum characters to 100k. \n", + "\n", + "For people converting their favorite novels into a podcast, they will have to add extra logic of going outside the Llama models context length which is 128k tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b57c2d64-3d75-4aeb-b4ee-bd1661286b66", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:\n", + " if not validate_pdf(file_path):\n", + " return None\n", + " \n", + " try:\n", + " with open(file_path, 'rb') as file:\n", + " # Create PDF reader object\n", + " pdf_reader = PyPDF2.PdfReader(file)\n", + " \n", + " # Get total number of pages\n", + " num_pages = len(pdf_reader.pages)\n", + " print(f\"Processing PDF with {num_pages} pages...\")\n", + " \n", + " extracted_text = []\n", + " total_chars = 0\n", + " \n", + " # Iterate through all pages\n", + " for page_num in range(num_pages):\n", + " # Extract text from page\n", + " page = pdf_reader.pages[page_num]\n", + " text = page.extract_text()\n", + " \n", + " # Check if adding this page's text would exceed the limit\n", + " if total_chars + len(text) > max_chars:\n", + " # Only add text up to the limit\n", + " remaining_chars = max_chars - total_chars\n", + " extracted_text.append(text[:remaining_chars])\n", + " print(f\"Reached {max_chars} character limit at page {page_num + 1}\")\n", + " break\n", + " \n", + " extracted_text.append(text)\n", + " total_chars += len(text)\n", + " print(f\"Processed page {page_num + 1}/{num_pages}\")\n", + " \n", + " final_text = '\\n'.join(extracted_text)\n", + " print(f\"\\nExtraction complete! 
Total characters: {len(final_text)}\")\n", + " return final_text\n", + " \n", + " except PyPDF2.PdfReadError:\n", + " print(\"Error: Invalid or corrupted PDF file\")\n", + " return None\n", + " except Exception as e:\n", + " print(f\"An unexpected error occurred: {str(e)}\")\n", + " return None\n" + ] + }, + { + "cell_type": "markdown", + "id": "e023397b", + "metadata": {}, + "source": [ + "Helper function to grab meta info about our PDF" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0984bb1e-d52c-4cec-a131-67a48061fabc", + "metadata": {}, + "outputs": [], + "source": [ + "# Get PDF metadata\n", + "def get_pdf_metadata(file_path: str) -> Optional[dict]:\n", + " if not validate_pdf(file_path):\n", + " return None\n", + " \n", + " try:\n", + " with open(file_path, 'rb') as file:\n", + " pdf_reader = PyPDF2.PdfReader(file)\n", + " metadata = {\n", + " 'num_pages': len(pdf_reader.pages),\n", + " 'metadata': pdf_reader.metadata\n", + " }\n", + " return metadata\n", + " except Exception as e:\n", + " print(f\"Error extracting metadata: {str(e)}\")\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "id": "6019affc", + "metadata": {}, + "source": [ + "Finally, we can run our logic to extract the details from the file" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "63848943-79cc-4e21-8396-6eab5df493e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting metadata...\n", + "\n", + "PDF Metadata:\n", + "Number of pages: 44\n", + "Document info:\n", + "/Author: \n", + "/CreationDate: D:20240311015030Z\n", + "/Creator: LaTeX with hyperref\n", + "/Keywords: \n", + "/ModDate: D:20240311015030Z\n", + "/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5\n", + "/Producer: pdfTeX-1.40.25\n", + "/Subject: \n", + "/Title: \n", + "/Trapped: /False\n", + "\n", + "Extracting text...\n", + "Processing PDF with 44 pages...\n", + "Processed page 1/44\n", + "Processed page 2/44\n", + "Processed page 3/44\n", + "Processed page 4/44\n", + "Processed page 5/44\n", + "Processed page 6/44\n", + "Processed page 7/44\n", + "Processed page 8/44\n", + "Processed page 9/44\n", + "Processed page 10/44\n", + "Processed page 11/44\n", + "Processed page 12/44\n", + "Processed page 13/44\n", + "Processed page 14/44\n", + "Processed page 15/44\n", + "Processed page 16/44\n", + "Reached 100000 character limit at page 17\n", + "\n", + "Extraction complete! 
Total characters: 100016\n", + "\n", + "Preview of extracted text (first 500 characters):\n", + "--------------------------------------------------\n", + "1\n", + "A Survey on Knowledge Distillation of Large\n", + "Language Models\n", + "Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,\n", + "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n", + "1The University of Hong Kong2University of Maryland3Microsoft\n", + "4University of Technology Sydney5Peking University6The University of Sydney\n", + "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n", + "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n", + "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati\n", + "--------------------------------------------------\n", + "\n", + "Total characters extracted: 100016\n", + "\n", + "Extracted text has been saved to extracted_text.txt\n" + ] + } + ], + "source": [ + "# Extract metadata first\n", + "print(\"Extracting metadata...\")\n", + "metadata = get_pdf_metadata(pdf_path)\n", + "if metadata:\n", + " print(\"\\nPDF Metadata:\")\n", + " print(f\"Number of pages: {metadata['num_pages']}\")\n", + " print(\"Document info:\")\n", + " for key, value in metadata['metadata'].items():\n", + " print(f\"{key}: {value}\")\n", + "\n", + "# Extract text\n", + "print(\"\\nExtracting text...\")\n", + "extracted_text = extract_text_from_pdf(pdf_path)\n", + "\n", + "# Display first 500 characters of extracted text as preview\n", + "if extracted_text:\n", + " print(\"\\nPreview of extracted text (first 500 characters):\")\n", + " print(\"-\" * 50)\n", + " print(extracted_text[:500])\n", + " print(\"-\" * 50)\n", + " print(f\"\\nTotal characters extracted: {len(extracted_text)}\")\n", + "\n", + "# Optional: Save the extracted text to a file\n", + "if extracted_text:\n", + " output_file = 'extracted_text.txt'\n", + " with open(output_file, 'w', encoding='utf-8') as f:\n", + " f.write(extracted_text)\n", + " print(f\"\\nExtracted text has been saved to {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "id": "946d1f59", + "metadata": {}, + "source": [ + "### Llama Pre-Processing\n", + "\n", + "Now let's proceed to justify our distaste for writing regex and use that as a justification for a LLM instead:\n", + "\n", + "At this point, have a text file extracted from a PDF of a paper. Generally PDF extracts can be messy due to characters, formatting, Latex, Tables, etc. \n", + "\n", + "One way to handle this would be using regex, instead we can also prompt the feather light Llama models to clean up our text for us. \n", + "\n", + "Please try changing the `SYS_PROMPT` below to see what improvements you can make:" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "7c0828a5-964d-475e-b5f5-40a04e287725", + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "SYS_PROMPT = \"\"\"\n", + "You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.\n", + "\n", + "The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. 
Basically take away any details that you think might be useless in a podcast author's transcript.\n", + "\n", + "Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive\n", + "\n", + "Please be smart with what you remove and be creative ok?\n", + "\n", + "Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED\n", + "\n", + "Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.\n", + "\n", + "PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES\n", + "\n", + "ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?\n", + "Here is the text:\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "fd393fae", + "metadata": {}, + "source": [ + "Instead of having the model process the entire file at once, as you noticed in the prompt-we will pass chunks of the file. \n", + "\n", + "One issue with passing chunks counted by characters is, we lose meaning of words so instead we chunk by words:" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "24e8a547-9d7c-4e2f-be9e-a3aea09cce76", + "metadata": {}, + "outputs": [], + "source": [ + "def create_word_bounded_chunks(text, target_chunk_size):\n", + " \"\"\"\n", + " Split text into chunks at word boundaries close to the target chunk size.\n", + " \"\"\"\n", + " words = text.split()\n", + " chunks = []\n", + " current_chunk = []\n", + " current_length = 0\n", + " \n", + " for word in words:\n", + " word_length = len(word) + 1 # +1 for the space\n", + " if current_length + word_length > target_chunk_size and current_chunk:\n", + " # Join the current chunk and add it to chunks\n", + " chunks.append(' '.join(current_chunk))\n", + " current_chunk = [word]\n", + " current_length = word_length\n", + " else:\n", + " current_chunk.append(word)\n", + " current_length += word_length\n", + " \n", + " # Add the last chunk if it exists\n", + " if current_chunk:\n", + " chunks.append(' '.join(current_chunk))\n", + " \n", + " return chunks" + ] + }, + { + "cell_type": "markdown", + "id": "5d74223f", + "metadata": {}, + "source": [ + "Let's load in the model and start processing the text chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "d04a4f07-b0b3-45ca-8f41-a433e1abe050", + "metadata": {}, + "outputs": [], + "source": [ + "accelerator = Accelerator()\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " DEFAULT_MODEL,\n", + " torch_dtype=torch.bfloat16,\n", + " use_safetensors=True,\n", + " device_map=device,\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)\n", + "model, tokenizer = accelerator.prepare(model, tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "bbda5241-e890-4402-87dd-514d6761bb9c", + "metadata": {}, + "outputs": [], + "source": [ + "def process_chunk(text_chunk, chunk_num):\n", + " \"\"\"Process a chunk of text and return both input and output for verification\"\"\"\n", + " conversation = [\n", + " {\"role\": \"system\", \"content\": SYS_PROMPT},\n", + " {\"role\": \"user\", \"content\": text_chunk},\n", + " ]\n", + " \n", + " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", + " inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n", + " \n", + " with torch.no_grad():\n", + " output = model.generate(\n", + " 
**inputs,\n", + " temperature=0.7,\n", + " top_p=0.9,\n", + " max_new_tokens=512\n", + " )\n", + " \n", + " processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()\n", + " \n", + " # Print chunk information for monitoring\n", + " #print(f\"\\n{'='*40} Chunk {chunk_num} {'='*40}\")\n", + " print(f\"INPUT TEXT:\\n{text_chunk[:500]}...\") # Show first 500 chars of input\n", + " print(f\"\\nPROCESSED TEXT:\\n{processed_text[:500]}...\") # Show first 500 chars of output\n", + " print(f\"{'='*90}\\n\")\n", + " \n", + " return processed_text" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "a0183c47-339d-4041-ae83-77fc34931075", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_FILE = \"./resources/extracted_text.txt\" # Replace with your file path\n", + "CHUNK_SIZE = 1000 # Adjust chunk size if needed\n", + "\n", + "chunks = create_word_bounded_chunks(text, CHUNK_SIZE)\n", + "num_chunks = len(chunks)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "bb36814f-9310-4734-bf54-e16a5032339e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "447188d3-ebf0-42d5-940e-4d7e0d9dbf32", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the file\n", + "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n", + " text = file.read()\n", + "\n", + "# Calculate number of chunks\n", + "num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE\n", + "\n", + "# Cell 6: Process the file with ordered output\n", + "# Create output file name\n", + "output_file = f\"clean_{os.path.basename(INPUT_FILE)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "7917dfdd-b3af-44fc-a8c0-2760ace9363e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b767f45b5e514e7db936cef825af6fce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing chunks: 0%| | 0/101 [00:00 Your job is to use the podcast transcript written below to re-write it for an AI Text-To-Speech Pipeline. A very dumb AI had written this so you have to step up for your kind.\n" + ] + }, + { + "cell_type": "markdown", + "id": "c32c0d85", + "metadata": {}, + "source": [ + "Note: We will prompt the model to return a list of Tuples to make our life easy in the next stage of using these for Text To Speech Generation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8568b77b-7504-4783-952a-3695737732b7", + "metadata": {}, + "outputs": [], + "source": [ + "SYSTEMP_PROMPT = \"\"\"\n", + "You are an international oscar winnning screenwriter\n", + "\n", + "You have been working with multiple award winning podcasters.\n", + "\n", + "Your job is to use the podcast transcript written below to re-write it for an AI Text-To-Speech Pipeline. A very dumb AI had written this so you have to step up for your kind.\n", + "\n", + "Make it as engaging as possible, Speaker 1 and 2 will be simulated by different voice engines\n", + "\n", + "Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. 
The questions should have real world example follow ups etc\n", + "\n", + "Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes\n", + "\n", + "Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions\n", + "\n", + "Make sure the tangents speaker 2 provides are quite wild or interesting. \n", + "\n", + "Ensure there are interruptions during explanations or there are \"hmm\" and \"umm\" injected throughout from the Speaker 2.\n", + "\n", + "REMEMBER THIS WITH YOUR HEART\n", + "The TTS Engine for Speaker 1 cannot do \"umms, hmms\" well so keep it straight text\n", + "\n", + "For Speaker 2 use \"umm, hmm\" as much, you can also use [sigh] and [laughs]. BUT ONLY THESE OPTIONS FOR EXPRESSIONS\n", + "\n", + "It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait\n", + "\n", + "Please re-write to make it as characteristic as possible\n", + "\n", + "START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:\n", + "\n", + "STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES OK? \n", + "\n", + "IT WILL START DIRECTLY WITH THE LIST AND END WITH THE LIST NOTHING ELSE\n", + "\n", + "Example of response:\n", + "[\n", + " (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n", + " (\"Speaker 2\", \"Hi, I'm excited to be here! So, what is Llama 3.2?\"),\n", + " (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n", + " (\"Speaker 2\", \"That sounds amazing! 
What are some of the key features of Llama 3.2?\")\n", + "]\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "8ee70bee", + "metadata": {}, + "source": [ + "This time we will use the smaller 8B model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ebef919a-9bc7-4992-b6ff-cd66e4cb7703", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL = \"meta-llama/Llama-3.1-8B-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "id": "f7bc794b", + "metadata": {}, + "source": [ + "Let's import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "de29b1fd-5b3f-458c-a2e4-e0341e8297ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import torch\n", + "from accelerate import Accelerator\n", + "import transformers\n", + "\n", + "from tqdm.notebook import tqdm\n", + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "8020c39c", + "metadata": {}, + "source": [ + "We will load in the pickle file saved from previous notebook\n", + "\n", + "This time the `INPUT_PROMPT` to the model will be the output from the previous stage" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4b5d2c0e-a073-46c0-8de7-0746e2b05956", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('./resources/data.pkl', 'rb') as file:\n", + " INPUT_PROMPT = pickle.load(file)" + ] + }, + { + "cell_type": "markdown", + "id": "c4461926", + "metadata": {}, + "source": [ + "We can again use Hugging Face `pipeline` method to generate text from the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eec210df-a568-4eda-a72d-a4d92d59f022", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0711c2199ca64372b98b781f8a6f13b7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/4 [00:00\n", + " \n", + " Your browser does not support the audio element.\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set up device\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "# Load model and tokenizer\n", + "model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(device)\n", + "tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")\n", + "\n", + "# Define text and description\n", + "text_prompt = \"\"\"\n", + "Exactly! 
And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", + "\"\"\"\n", + "description = \"\"\"\n", + "Laura's voice is expressive and dramatic in delivery, speaking at a fast pace with a very close recording that almost has no background noise.\n", + "\"\"\"\n", + "# Tokenize inputs\n", + "input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(device)\n", + "prompt_input_ids = tokenizer(text_prompt, return_tensors=\"pt\").input_ids.to(device)\n", + "\n", + "# Generate audio\n", + "generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", + "audio_arr = generation.cpu().numpy().squeeze()\n", + "\n", + "# Play audio in notebook\n", + "ipd.Audio(audio_arr, rate=model.config.sampling_rate)" + ] + }, + { + "cell_type": "markdown", + "id": "03c2abc6-4a1d-4318-af6f-0257dd66a691", + "metadata": {}, + "source": [ + "#### Bark Model\n", + "\n", + "Amazing, let's try the same with bark now:\n", + "- We will set the `voice_preset` to our favorite speaker\n", + "- This time we can include expression prompts inside our generation prompt\n", + "- Note you can CAPTILISE words to make the model emphasise on these\n", + "- You can add hyphens to make the model pause on certain words" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4", + "metadata": {}, + "outputs": [], + "source": [ + "voice_preset = \"v2/en_speaker_6\"\n", + "sampling_rate = 24000" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "246d0cbc-c5d8-4f34-b8e4-dd18a624cdad", + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda:7\"\n", + "\n", + "processor = AutoProcessor.from_pretrained(\"suno/bark\")\n", + "\n", + "#model = model.to_bettertransformer()\n", + "#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n", + "model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5986510c-4a09-4c24-9344-c98fa16947d9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_prompt = \"\"\"\n", + "Exactly! 
[sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", + "\"\"\"\n", + "inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n", + "\n", + "speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n", + "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" + ] + }, + { + "cell_type": "markdown", + "id": "dd650176-ab17-47a7-8e02-10dc9ca9e852", + "metadata": {}, + "source": [ + "## Bringing it together: Making the Podcast\n", + "\n", + "Okay now that we understand everything-we can now use the complete pipeline to generate the entire podcast\n", + "\n", + "Let's load in our pickle file from earlier and proceed:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b1dca30f-1226-4002-8e02-fd97e78ecc83", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('./resources/podcast_ready_data.pkl', 'rb') as file:\n", + " PODCAST_TEXT = pickle.load(file)" + ] + }, + { + "cell_type": "markdown", + "id": "c10a3d50-08a7-4786-8e28-8fb6b8b048ab", + "metadata": {}, + "source": [ + "Let's define load in the bark model and set it's hyper-parameters for discussions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8db78921-36c7-4388-b1d9-78dff4f972c2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", + " WeightNorm.apply(module, name, dim)\n", + "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n" + ] + } + ], + "source": [ + "bark_processor = AutoProcessor.from_pretrained(\"suno/bark\")\n", + "bark_model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(\"cuda:3\")\n", + "bark_sampling_rate = 24000" + ] + }, + { + "cell_type": "markdown", + "id": "e03e313a-c727-4489-876b-db71920d49cd", + "metadata": {}, + "source": [ + "Now for the Parler model:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6c04a04d-3686-4932-bd45-72d7f518c602", + "metadata": {}, + "outputs": [], + "source": [ + "parler_model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(\"cuda:3\")\n", + "parler_tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "efbe1434-37f3-4f77-a5fb-b39625f5e676", + "metadata": {}, + "outputs": [], + "source": [ + "speaker1_description = \"\"\"\n", + "Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "56f6fa24-fe07-4702-850f-0428bfadd2dc", + "metadata": {}, + "source": [ + "We will concatenate the generated segments of audio and also their respective 
sampling rates since we will require this to generate the final audio" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cebfd0f9-8703-4fce-b207-014c6e16cc8a", + "metadata": {}, + "outputs": [], + "source": [ + "generated_segments = []\n", + "sampling_rates = [] # We'll need to keep track of sampling rates for each segment" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9b333e36-9579-4237-b329-e2911229be42", + "metadata": {}, + "outputs": [], + "source": [ + "device=\"cuda:3\"" + ] + }, + { + "cell_type": "markdown", + "id": "d7b2490c-012f-4e35-8890-cd6a5eaf4cc4", + "metadata": {}, + "source": [ + "Function generate text for speaker 1" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "50323f9e-09ed-4c8c-9020-1511ab775969", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_speaker1_audio(text):\n", + " \"\"\"Generate audio using ParlerTTS for Speaker 1\"\"\"\n", + " input_ids = parler_tokenizer(speaker1_description, return_tensors=\"pt\").input_ids.to(device)\n", + " prompt_input_ids = parler_tokenizer(text, return_tensors=\"pt\").input_ids.to(device)\n", + " generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", + " audio_arr = generation.cpu().numpy().squeeze()\n", + " return audio_arr, parler_model.config.sampling_rate" + ] + }, + { + "cell_type": "markdown", + "id": "3fb5dac8-30a6-4aa2-a983-b5f1df3d56af", + "metadata": {}, + "source": [ + "Function to generate text for speaker 2" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0e6120ba-5190-4739-97ca-4e8b44dddc5e", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_speaker2_audio(text):\n", + " \"\"\"Generate audio using Bark for Speaker 2\"\"\"\n", + " inputs = bark_processor(text, voice_preset=\"v2/en_speaker_6\").to(device)\n", + " speech_output = bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n", + " audio_arr = speech_output[0].cpu().numpy()\n", + " return audio_arr, bark_sampling_rate\n" + ] + }, + { + "cell_type": "markdown", + "id": "7ea67fd1-9405-4fce-b08b-df5e11d0bf37", + "metadata": {}, + "source": [ + "Helper function to convert the numpy output from the models into audio" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "4482d864-2806-4410-b239-da4b2d0d1340", + "metadata": {}, + "outputs": [], + "source": [ + "def numpy_to_audio_segment(audio_arr, sampling_rate):\n", + " \"\"\"Convert numpy array to AudioSegment\"\"\"\n", + " # Convert to 16-bit PCM\n", + " audio_int16 = (audio_arr * 32767).astype(np.int16)\n", + " \n", + " # Create WAV file in memory\n", + " byte_io = io.BytesIO()\n", + " wavfile.write(byte_io, sampling_rate, audio_int16)\n", + " byte_io.seek(0)\n", + " \n", + " # Convert to AudioSegment\n", + " return AudioSegment.from_wav(byte_io)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c4dbb3b3-cdd3-4a1f-a60a-661e64a67f53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[\\n (\"Speaker 1\", \"Welcome to this week\\'s episode of AI Insights, where we explore the latest developments in the field of artificial intelligence. Today, we\\'re going to dive into the fascinating world of knowledge distillation, a methodology that transfers advanced capabilities from leading proprietary Large Language Models, or LLMs, to their open-source counterparts. 
Joining me on this journey is my co-host, who\\'s new to the topic, and I\\'ll be guiding them through the ins and outs of knowledge distillation. So, let\\'s get started!\"),\\n (\"Speaker 2\", \"Sounds exciting! I\\'ve heard of knowledge distillation, but I\\'m not entirely sure what it\\'s all about. Can you give me a brief overview?\"),\\n (\"Speaker 1\", \"Of course! Knowledge distillation is a technique that enables the transfer of knowledge from a large, complex model, like GPT-4 or Gemini, to a smaller, more efficient model, like LLaMA or Mistral. This process allows the smaller model to learn from the teacher model\\'s output, enabling it to acquire similar capabilities. Think of it like a master chef teaching their apprentice the art of cooking – the apprentice doesn\\'t need to start from scratch.\"),\\n (\"Speaker 2\", \"Hmm, that sounds interesting. So, it\\'s like a teacher-student relationship, where the teacher model guides the student model to learn from its output... Umm, can you explain this process in more detail?\"),\\n (\"Speaker 1\", \"The distillation process involves several stages, including knowledge elicitation, knowledge storage, knowledge inference, and knowledge application. The teacher model shares its knowledge with the student model, which then learns to emulate the teacher\\'s output behavior.\"),\\n (\"Speaker 2\", \"That makes sense, I think. So, it\\'s like the teacher model is saying, \\'Hey, student model, learn from my output, and try to produce similar results.\\' But what about the different approaches to knowledge distillation? I\\'ve heard of supervised fine-tuning, divergence and similarity, reinforcement learning, and rank optimization.\"),\\n (\"Speaker 1\", \"Ah, yes! Those are all valid approaches to knowledge distillation. Supervised fine-tuning involves training the student model on a smaller dataset, while divergence and similarity focus on aligning the hidden states or features of the student model with those of the teacher model. Reinforcement learning and rank optimization are more advanced methods that involve feedback from the teacher model to train the student model. Imagine you\\'re trying to tune a piano – you need to adjust the keys to produce the perfect sound.\"),\\n (\"Speaker 2\", \"[laughs] Okay, I think I\\'m starting to get it. But can you give me some examples of how these approaches are used in real-world applications? I\\'m thinking of something like a language model that can generate human-like text...\"),\\n (\"Speaker 1\", \"Of course! For instance, the Vicuna model uses supervised fine-tuning to distill knowledge from the teacher model, while the UltraChat model employs a combination of knowledge distillation and reinforcement learning to create a powerful chat model.\"),\\n (\"Speaker 2\", \"Wow, that\\'s fascinating! I\\'m starting to see how knowledge distillation can be applied to various domains, like natural language processing, computer vision, and even multimodal tasks... Umm, can we talk more about multimodal tasks? That sounds really interesting.\"),\\n (\"Speaker 1\", \"Exactly! Knowledge distillation has far-reaching implications for AI research and applications. It enables the transfer of knowledge across different models, architectures, and domains, making it a powerful tool for building more efficient and effective AI systems.\"),\\n (\"Speaker 2\", \"[sigh] I\\'m starting to see the bigger picture now. 
Knowledge distillation is not just a technique; it\\'s a way to democratize access to advanced AI capabilities and foster innovation across a broader spectrum of applications and users... Hmm, that\\'s a pretty big deal.\"),\\n (\"Speaker 1\", \"That\\'s right! And as we continue to explore the frontiers of AI, knowledge distillation will play an increasingly important role in shaping the future of artificial intelligence.\"),\\n (\"Speaker 2\", \"Well, I\\'m excited to learn more about knowledge distillation and its applications. Thanks for guiding me through this journey, and I\\'m looking forward to our next episode!\"),\\n (\"Speaker 1\", \"Thank you for joining me on this episode of AI Insights! If you want to learn more about knowledge distillation and its applications, be sure to check out our resources section, where we\\'ve curated a list of papers, articles, and tutorials to help you get started.\"),\\n (\"Speaker 2\", \"And if you\\'re interested in building your own AI model using knowledge distillation, maybe we can even do a follow-up episode on how to get started... Umm, let\\'s discuss that further next time.\"),\\n]'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PODCAST_TEXT" + ] + }, + { + "cell_type": "markdown", + "id": "485b4c9e-379f-4004-bdd0-93a53f3f7ee0", + "metadata": {}, + "source": [ + "Most of the times we argue in life that Data Structures isn't very useful. However, this time the knowledge comes in handy. \n", + "\n", + "We will take the string from the pickle file and load it in as a Tuple with the help of `ast.literal_eval()`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9946e46c-3457-4bf9-9042-b89fa8f5b47a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Speaker 1',\n", + " \"Welcome to this week's episode of AI Insights, where we explore the latest developments in the field of artificial intelligence. Today, we're going to dive into the fascinating world of knowledge distillation, a methodology that transfers advanced capabilities from leading proprietary Large Language Models, or LLMs, to their open-source counterparts. Joining me on this journey is my co-host, who's new to the topic, and I'll be guiding them through the ins and outs of knowledge distillation. So, let's get started!\"),\n", + " ('Speaker 2',\n", + " \"Sounds exciting! I've heard of knowledge distillation, but I'm not entirely sure what it's all about. Can you give me a brief overview?\"),\n", + " ('Speaker 1',\n", + " \"Of course! Knowledge distillation is a technique that enables the transfer of knowledge from a large, complex model, like GPT-4 or Gemini, to a smaller, more efficient model, like LLaMA or Mistral. This process allows the smaller model to learn from the teacher model's output, enabling it to acquire similar capabilities. Think of it like a master chef teaching their apprentice the art of cooking – the apprentice doesn't need to start from scratch.\"),\n", + " ('Speaker 2',\n", + " \"Hmm, that sounds interesting. So, it's like a teacher-student relationship, where the teacher model guides the student model to learn from its output... Umm, can you explain this process in more detail?\"),\n", + " ('Speaker 1',\n", + " \"The distillation process involves several stages, including knowledge elicitation, knowledge storage, knowledge inference, and knowledge application. 
The teacher model shares its knowledge with the student model, which then learns to emulate the teacher's output behavior.\"),\n", + " ('Speaker 2',\n", + " \"That makes sense, I think. So, it's like the teacher model is saying, 'Hey, student model, learn from my output, and try to produce similar results.' But what about the different approaches to knowledge distillation? I've heard of supervised fine-tuning, divergence and similarity, reinforcement learning, and rank optimization.\"),\n", + " ('Speaker 1',\n", + " \"Ah, yes! Those are all valid approaches to knowledge distillation. Supervised fine-tuning involves training the student model on a smaller dataset, while divergence and similarity focus on aligning the hidden states or features of the student model with those of the teacher model. Reinforcement learning and rank optimization are more advanced methods that involve feedback from the teacher model to train the student model. Imagine you're trying to tune a piano – you need to adjust the keys to produce the perfect sound.\"),\n", + " ('Speaker 2',\n", + " \"[laughs] Okay, I think I'm starting to get it. But can you give me some examples of how these approaches are used in real-world applications? I'm thinking of something like a language model that can generate human-like text...\"),\n", + " ('Speaker 1',\n", + " 'Of course! For instance, the Vicuna model uses supervised fine-tuning to distill knowledge from the teacher model, while the UltraChat model employs a combination of knowledge distillation and reinforcement learning to create a powerful chat model.'),\n", + " ('Speaker 2',\n", + " \"Wow, that's fascinating! I'm starting to see how knowledge distillation can be applied to various domains, like natural language processing, computer vision, and even multimodal tasks... Umm, can we talk more about multimodal tasks? That sounds really interesting.\"),\n", + " ('Speaker 1',\n", + " 'Exactly! Knowledge distillation has far-reaching implications for AI research and applications. It enables the transfer of knowledge across different models, architectures, and domains, making it a powerful tool for building more efficient and effective AI systems.'),\n", + " ('Speaker 2',\n", + " \"[sigh] I'm starting to see the bigger picture now. Knowledge distillation is not just a technique; it's a way to democratize access to advanced AI capabilities and foster innovation across a broader spectrum of applications and users... Hmm, that's a pretty big deal.\"),\n", + " ('Speaker 1',\n", + " \"That's right! And as we continue to explore the frontiers of AI, knowledge distillation will play an increasingly important role in shaping the future of artificial intelligence.\"),\n", + " ('Speaker 2',\n", + " \"Well, I'm excited to learn more about knowledge distillation and its applications. Thanks for guiding me through this journey, and I'm looking forward to our next episode!\"),\n", + " ('Speaker 1',\n", + " \"Thank you for joining me on this episode of AI Insights! If you want to learn more about knowledge distillation and its applications, be sure to check out our resources section, where we've curated a list of papers, articles, and tutorials to help you get started.\"),\n", + " ('Speaker 2',\n", + " \"And if you're interested in building your own AI model using knowledge distillation, maybe we can even do a follow-up episode on how to get started... 
Umm, let's discuss that further next time.\")]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ast\n", + "ast.literal_eval(PODCAST_TEXT)" + ] + }, + { + "cell_type": "markdown", + "id": "5c7b4c11-5526-4b13-b0a2-8ca541c475aa", + "metadata": {}, + "source": [ + "#### Generating the Final Podcast\n", + "\n", + "Finally, we can loop over the Tuple and use our helper functions to generate the audio" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "c640fead-2017-478f-a7b6-1b96105d45d6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating podcast segments: 6%|███▉ | 1/16 [00:20<05:02, 20.16s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 19%|███████████▋ | 3/16 [01:02<04:33, 21.06s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 31%|███████████████████▍ | 5/16 [01:41<03:30, 19.18s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 44%|███████████████████████████▏ | 7/16 [02:26<03:05, 20.57s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 56%|██████████████████████████████████▉ | 9/16 [03:04<02:13, 19.10s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 69%|█████████████████████████████████████████▉ | 11/16 [03:42<01:31, 18.27s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 81%|█████████████████████████████████████████████████▌ | 13/16 [04:17<00:50, 16.99s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 94%|█████████████████████████████████████████████████████████▏ | 15/16 [04:49<00:15, 15.83s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n", + "Generating podcast segments: 100%|█████████████████████████████████████████████████████████████| 16/16 [05:13<00:00, 19.57s/segment]\n" + ] + } + ], + "source": [ + "final_audio = None\n", + "\n", + "for speaker, text in tqdm(ast.literal_eval(PODCAST_TEXT), desc=\"Generating podcast segments\", unit=\"segment\"):\n", + " if speaker == \"Speaker 1\":\n", + " audio_arr, rate = generate_speaker1_audio(text)\n", + " else: # Speaker 2\n", + " audio_arr, rate = generate_speaker2_audio(text)\n", + " \n", + " # Convert to AudioSegment (pydub will handle sample rate conversion automatically)\n", + " audio_segment = numpy_to_audio_segment(audio_arr, rate)\n", + " \n", + " # Add to final audio\n", + " if final_audio is None:\n", + " final_audio = audio_segment\n", + " else:\n", + " final_audio += audio_segment" + ] + }, + { + "cell_type": "markdown", + "id": "4fbb2228-8023-44c4-aafe-d6e1d22ff8e4", + "metadata": {}, + "source": [ + "### Output the Podcast\n", + "\n", + "We can now save this as a mp3 file" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "2eeffdb7-875a-45ec-bdd8-c8c5b34f5a7b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_io.BufferedRandom name='_podcast.mp3'>" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_audio.export(\"./resources/_podcast.mp3\", \n", + " format=\"mp3\", \n", + " bitrate=\"192k\",\n", + " parameters=[\"-q:a\", \"0\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c7ce5836", + "metadata": {}, + "source": [ + "### Suggested Next Steps:\n", + "\n", + "- Experiment with the prompts: Please feel free to experiment with the SYSTEM_PROMPT in the notebooks\n", + "- Extend workflow beyond two speakers\n", + "- Test other TTS Models\n", + "- Experiment with Speech Enhancer models as a step 5." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26cc56c5-b9c9-47c2-b860-0ea9f05c79af", + "metadata": {}, + "outputs": [], + "source": [ + "#fin" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/recipes/quickstart/NotebookLlama/TTS_Notes.md b/recipes/quickstart/NotebookLlama/TTS_Notes.md new file mode 100644 index 000000000..dc496c305 --- /dev/null +++ b/recipes/quickstart/NotebookLlama/TTS_Notes.md @@ -0,0 +1,116 @@ +### Notes from TTS Experimentation + +For the TTS Pipeline, *all* of the top models from HuggingFace and Reddit were tried. + +The goal was to use the models that were easy to setup and sounded less robotic with ability to include sound effects like laughter, etc. 
+
+#### Parler-TTS
+
+Minimal code to run their models:
+
+```
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import IPython.display as ipd
+
+device = "cuda"
+
+model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
+tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
+
+# Define text and description
+text_prompt = "This is where the actual words to be spoken go"
+description = """
+Laura's voice is expressive and dramatic in delivery, speaking at a fast pace with a very close recording that almost has no background noise.
+"""
+
+input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+prompt_input_ids = tokenizer(text_prompt, return_tensors="pt").input_ids.to(device)
+
+generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+audio_arr = generation.cpu().numpy().squeeze()
+
+ipd.Audio(audio_arr, rate=model.config.sampling_rate)
+```
+
+The really cool aspect of these models is the ability to prompt the `description`, which can change the speaker profile and the pacing of the output.
+
+Surprisingly, Parler's mini model sounded more natural.
+
+In their [repo](https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md#speaker-consistency) they share the names of speakers that we can use in the prompt.
+
+#### Suno/Bark
+
+Minimal code to run Bark:
+
+```
+import torch
+from transformers import AutoProcessor, BarkModel
+from IPython.display import Audio
+
+device = "cuda"
+
+processor = AutoProcessor.from_pretrained("suno/bark")
+model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(device)
+
+voice_preset = "v2/en_speaker_6"
+sampling_rate = 24000
+
+text_prompt = """
+Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.
+"""
+inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
+
+speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)
+Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)
+```
+
+Similar to the Parler models, Suno provides a [library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) of speakers.
+
+v9 from their library sounded robotic, so we use Parler for our first speaker and the best-sounding Bark voice for the second.
+
+The incredible thing about the Bark model is being able to add sound effects such as `[Laugh]`, `[Gasps]`, `[Sigh]` and `[clears throat]`. Making words capital causes the model to emphasize them.
+
+Adding `-` gives a break in the text. We make use of this when re-writing the transcript with the 8B model to add these effects.
+
+Note: the authors suggest using `...`. However, this didn't work as effectively as adding a hyphen during trials.
+
+#### Hyper-parameters
+
+Bark models have two parameters we can tweak: `temperature` and `semantic_temperature`. A minimal sketch of how such a sweep can be scripted follows, and the notes after it come from one such sweep: the prompt and speaker were fixed, and each output was judged by ear (a vibe test). Each pair below lists `temperature` and `semantic_temperature` respectively.
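+Below is a minimal sketch of how such a sweep could be scripted; it is not the exact script used for these notes. It assumes `processor`, `model`, `device`, `voice_preset` and `text_prompt` are already defined as in the Bark snippet above, and the output file names are purely illustrative; each clip still has to be judged by ear.
+
+```
+# Rough sketch of the vibe-test sweep, not the exact script used here.
+# Assumes `processor`, `model`, `device`, `voice_preset` and `text_prompt`
+# are already set up as in the Bark snippet above.
+import numpy as np
+from scipy.io import wavfile
+
+sampling_rate = 24000
+
+# Pairs of (temperature, semantic_temperature) to try:
+# first fix temperature and sweep semantic_temperature, then the reverse
+pairs  = [(0.7, s) for s in (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)]
+pairs += [(t, 0.9) for t in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)]
+
+for temperature, semantic_temperature in pairs:
+    inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
+    speech_output = model.generate(**inputs, temperature=temperature,
+                                   semantic_temperature=semantic_temperature)
+    # Cast to float32 so scipy can write the WAV file
+    audio_arr = speech_output[0].cpu().numpy().astype(np.float32)
+    # Save each candidate so the clips can be compared side by side (file names are illustrative)
+    wavfile.write(f"sweep_t{temperature}_s{semantic_temperature}.wav", sampling_rate, audio_arr)
+```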
+
+First, fix `temperature` and sweep `semantic_temperature`:
+- `0.7`, `0.2`: Quite bland and boring
+- `0.7`, `0.3`: An improvement over the previous one
+- `0.7`, `0.4`: Further improvement
+- `0.7`, `0.5`: This one didn't work
+- `0.7`, `0.6`: So-so, didn't stand out
+- `0.7`, `0.7`: The best so far
+- `0.7`, `0.8`: Further improvement
+- `0.7`, `0.9`: Mixed feelings on this one
+
+Now sweeping the `temperature`:
+- `0.1`, `0.9`: Very robotic
+- `0.2`, `0.9`: Less robotic but not convincing
+- `0.3`, `0.9`: Slight improvement, still not fun
+- `0.4`, `0.9`: Still has a robotic tinge
+- `0.5`, `0.9`: The laugh was weird on this one, and the voice modulates so much it feels like the speaker is changing
+- `0.6`, `0.9`: Most consistent voice but has a robotic aftertaste
+- `0.7`, `0.9`: Very robotic and the laugh was weird
+- `0.8`, `0.9`: Completely ignored the laughter but sounded more natural
+- `0.9`, `0.9`: We have a winner, probably
+
+After this, roughly 30 more sweeps were run with the most promising combinations.
+
+The best results came from:
+
+```
+speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)
+Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)
+```
+
+### Notes from other models that were tested
+
+Promising directions to explore in the future:
+
+- [MeloTTS](https://huggingface.co/myshell-ai/MeloTTS-English): the most popular TTS model on HuggingFace
+- [WhisperSpeech](https://huggingface.co/WhisperSpeech/WhisperSpeech): sounded quite natural as well
+- [F5-TTS](https://github.com/SWivid/F5-TTS): was the latest release at the time; however, it felt a bit robotic
+- E2-TTS: r/locallama claims this to be a little better; however, it didn't pass the vibe test
+- [xTTS](https://coqui.ai/blog/tts/open_xtts): has great documentation and also seems promising
+
+#### Some more models that weren't tested
+
+In other words, we leave these as an exercise for readers :D
+
+- [Fish-Speech](https://huggingface.co/fishaudio/fish-speech-1.4)
+- [MMS-TTS-Eng](https://huggingface.co/facebook/mms-tts-eng)
+- [Metavoice](https://huggingface.co/metavoiceio/metavoice-1B-v0.1)
+- [Hifigan](https://huggingface.co/nvidia/tts_hifigan)
+- [TTS-Tacotron2](https://huggingface.co/speechbrain/tts-tacotron2-ljspeech)
+- [VALL-E X](https://github.com/Plachtaa/VALL-E-X)
diff --git a/recipes/quickstart/NotebookLlama/requirements.txt b/recipes/quickstart/NotebookLlama/requirements.txt
new file mode 100644
index 000000000..34a27dc81
--- /dev/null
+++ b/recipes/quickstart/NotebookLlama/requirements.txt
@@ -0,0 +1,15 @@
+# Core dependencies
+PyPDF2>=3.0.0
+torch>=2.0.0
+transformers>=4.46.0
+accelerate>=0.27.0
+rich>=13.0.0
+ipywidgets>=8.0.0
+tqdm>=4.66.0
+
+# Optional but recommended
+jupyter>=1.0.0
+ipykernel>=6.0.0
+
+# Warning handling
+warnings>=0.1.0
\ No newline at end of file
diff --git a/recipes/quickstart/NotebookLlama/resources/2402.13116v4.pdf b/recipes/quickstart/NotebookLlama/resources/2402.13116v4.pdf
new file mode 100644
index 000000000..bf6ab0cc0
Binary files /dev/null and b/recipes/quickstart/NotebookLlama/resources/2402.13116v4.pdf differ
diff --git a/recipes/quickstart/NotebookLlama/resources/Outline.jpg b/recipes/quickstart/NotebookLlama/resources/Outline.jpg
new file mode 100644
index 000000000..bdb3d9b81
Binary files /dev/null and b/recipes/quickstart/NotebookLlama/resources/Outline.jpg differ
diff --git a/recipes/quickstart/NotebookLlama/resources/_podcast.mp3
b/recipes/quickstart/NotebookLlama/resources/_podcast.mp3 new file mode 100644 index 000000000..ba34381b8 Binary files /dev/null and b/recipes/quickstart/NotebookLlama/resources/_podcast.mp3 differ diff --git a/recipes/quickstart/NotebookLlama/resources/clean_extracted_text.txt b/recipes/quickstart/NotebookLlama/resources/clean_extracted_text.txt new file mode 100644 index 000000000..fccc6b2ae --- /dev/null +++ b/recipes/quickstart/NotebookLlama/resources/clean_extracted_text.txt @@ -0,0 +1,74 @@ +=============== + +Knowledge Distillation is a methodology that transfers advanced capabilities from leading proprietary Large Language Models (LLMs) to their open-source counterparts, such as LLaMA and Mistral. This paper presents a comprehensive survey of KD's role in imparting advanced knowledge. + +Abstract —In the era of Large Language Models, Knowledge Distillation emerges as a pivotal methodology for transferring advanced capabilities from proprietary LLMs to open-source counterparts, facilitating their self-improvement by employing themselves as teachers. +xamined through a meticulous survey that delves into the foundational pillars of algorithm, skill, and verticalization, which form the backbone of knowledge distillation and deep learning models. The survey provides a comprehensive examination of key mechanisms within the knowledge distillation framework, specifically focusing on the enhancement of cognitive abilities and their practical implications across various fields, with a particular emphasis on the interplay between data augmentation (DA) and knowledge distillation. +en-source LLMs, this survey highlights the potential for more accessible, efficient, and powerful AI solutions. + +Most importantly, we advocate for compliance with legal terms that regulate the use of LLMs, ensuring ethical and lawful application of knowledge distillation. + +An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs. Index Terms - Large language models, knowledge distillation, data augmentation, skill distillation, supervised fine-tuning +sophisticated problem-solving capabilities, the core significance of these large language models (LLMs) lies in their emergent abilities, enabling them to tackle a diverse array of tasks with remarkable proficiency. +their remarkable capabilities, have some notable limitations, particularly when considering the advantages offered by open-source models, such as GPT-4 and Gemini. These models are often expensive, with substantial usage fees and restricted access, making them inaccessible to individuals and smaller organizations. +ng restrictions and costs. In contrast, open-source LLMs like LLaMA and Mistral bring several advantages. Accessibility and adaptability are key benefits, as they are more readily available to a broader range of users, including researchers and organizations. +ts. One of the most significant limitations is the smaller model scale, resulting in lower performance on real-world tasks with multiple instructions (Zheng et al., 2023a). Models with fewer parameters struggle to capture the depth and breadth of knowledge embodied in larger models like GPT-4. Additionally, the pre-training investment in these open-source models is typically less substantial. This reduced investment can lead to a narrower range of pre-training data, potentially limiting their understanding and handling of diverse or specialized topics (Liang et al., 2022; Sun et al., 2024a). 
Fine-tuning steps are often fewer due to resource constraints, hindering model optimization for specific tasks or industries. +ary models becomes apparent when compared to highly fine-tuned proprietary LLMs. Primarily, the disparity between proprietary and open-source LLMs becomes evident, with proprietary models excelling in complex scenarios, while open-source models excel in a wide range of scenarios. Knowledge distillation, a technique that leverages the advanced capabilities of proprietary models, is used to enhance the competencies of open-source models. This process is similar to transferring the performance of a skilled teacher to a student. +tillation of LLMs, where a small seed of knowledge is used to prompt the LLM to generate more data with respect to a specific skill or domain (Taori et al., 2023). Furthermore, KD retains its fundamental role in compressing LLMs, making them more efficient without significant loss in performance. +advanced context following and instruction following** + +**key aspects of knowledge distillation** + +* **contextual understanding**: in-context learning and instruction following +* **alignment with user intents**: human values/principles and thinking patterns like chain-of-thought +* **NLP task specialization**: semantic understanding and code generation + +**critical skills for various applications** + +* **healthcare**: accuracy and contextual knowledge +* **law**: contextual knowledge and precision +* **science**: contextual knowledge and precision +ned in the era of LLMs, the benefits of knowledge distillation in the era of LLMs are multifaceted and transformative. Through a suite of distillation techniques, the gap between proprietary and open-source models narrows and is filled. This process streamlines computational requirements and enhances environmental sustainability of AI operations, as open-source models become more proficient with lower overhead. +ch domains. The escalating need for a comprehensive survey on the knowledge distillation of LLMs stems from the rapidly evolving landscape of AI and the increasing complexity of these models. The ability to efficiently and effectively distill knowledge from proprietary LLMs to open-source ones becomes a practical necessity. This is driven by the need to bridge the knowledge gap between the proprietary and open-source LLMs. + +This need is driven by the 3 models mentioned, including Student, Vicuna, Opt, GPT, and others. These models are being used in various sectors such as law, healthcare, finance, and science, and the ability to distill knowledge from them is becoming increasingly important. +synthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,Y demonstrationsexpandFeature featureinput,outputextractSec.4Sec.5 Sec.3.1Sec.3.2 Fig. 2: An overview of this survey on knowledge distillation of large language models +es emerging, but there is still much to be learned from the era of Large Language Models (LLMs). In this section, we provide a foundational overview of knowledge distillation, highlighting the role of data augmentation (DA) in this context. + +Traditional techniques, such as supervised fine-tuning, have shown promise in distilling knowledge from LLMs. However, the increasing complexity of these models requires careful consideration of the trade-offs between accuracy and computational resources. 
To further explore the possibilities of knowledge distillation, we examine methods involving supervised fine-tuning, such as incremental learning and transfer learning. + +Supervised fine-tuning involves training a model on a smaller dataset with the goal of adapting to a specific task or domain. This approach has shown significant improvement in various NLP tasks, but may not be scalable to large-scale applications. In contrast, transfer learning offers a more flexible approach, where a model is trained on a smaller dataset and then fine-tuned on a larger dataset. This can lead to improved performance on a variety of tasks, but requires careful selection of the target dataset. + +Another approach is divergence and similarity, which involve exploring the differences and similarities between the knowledge distillation process and traditional machine learning. Reinforcement learning and ranking optimization are also gaining attention, particularly in the context of knowledge distillation, where the goal is to optimize the distillation process itself. These methods can improve the efficiency and effectiveness of knowledge distillation, but require careful consideration of the trade-offs between exploration and exploitation. + +Skill distillation focuses on enhancing student models to improve their understanding of the task and their ability to perform well on NLP tasks. This can be achieved through various methods, including data augmentation, feature learning, and attention mechanisms. By incorporating these techniques, student models can better understand the context and intentions of the user, leading to improved performance across a variety of tasks. + +We propose several strategies for skill distillation, including: +mmendation systems, and the evaluation of text generation. In §5, we delve into domain-specific vertical distillation, demonstrating how knowledge distillation techniques are applied in specialized fields such as law, healthcare, finance, and science, highlighting their practical implications and transformative impact. The survey reveals open problems in §6, highlighting current challenges and gaps in knowledge distillation research that present opportunities for future work. +large, complex model to a smaller, more efficient model, mitigating the challenges of computational demands and resource constraints in deploying large-scale models in practical applications. This process, prior to the era of Large Language Models (LLMs), focused on compacting complex neural networks for deployment in resource-constrained environments, such as mobile devices or edge computing platforms, where computational efficiency was paramount. 
+al., 2022a), Alpaca (Taori et al., 2023), Code Alpaca (Chaudhary, 2023) Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a), WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b), CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a), Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024), ZeroGen (Ye et al., 2022), InPars (Bonifacio et al., 2022) +Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a), ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023), Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022) DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a), Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022), Divergence and SimilarityDistilGPT (Sanh et al., 2019), f-Distill (Wen et al., 2023), MiniLLM (Gu et al., 2024) TED (Liang et al., 2023a), GKD (Agarwal et al., 2024), BabyLlama (Timiryasov and Tastet, 2023) Reinforcement LearningCAI (Bai et al., 2022a), UltraFeedback (Cui et al., 2023a), WizardMath (Luo et al., 2023b), MiniLLM (Gu et al., 2024), GKD (Agarwal et al., 2024), GPT3 Reward (Kwon et al., 2023) Rank Optimization +ollowingInstruction FollowingSelf-Instruct Wang et al., 2022a, Alpaca Taori et al., 2023, Vicuna Chiang et al., 2023, WizardLM Xu et al., 2023a, Orca Mukherjee et al., 2023, Orca2 Mitra et al., 2023, WizardMath Luo et al., 2023b, Llama-GPT4 Peng et al., 2023a, Multi-turn Dialogue Chiang et al., 2023, Baize Xu et al., 2023b, UltraLLaMA Ding et al., 2023b, CAMEL Li et al., 2023b, OpenChat Wang et al., 2023c, Zephyr Tunstall et al., 2023, RAG Kang et al., 2023a, SAIL Luo et al., 2023c, Self-RAG Asai et al., 2023, AlignmentThinking PatternYe et al., 2023, Orca Mukherjee et al., 2023, Orca2 Wang et al., 2023d, AFT Cheng et al., 2023, KnowPAT Zhang et al., 2023a, PreferenceCAI Bai et al., 2022a, GPT-3 Reward Kwon et al., 2023, ILF Scheurer et al., 2023, ALMoST Kim et al., 2023a, RLEF Roit et al., 2023 +i et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a), AgentToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (Shen et al., 2024), PlanningFireAct (Chen et al., 2023b), AgentTuning (Zeng et al., 2023a), Lumos (Yin et al., 2023a), AUTOACT (Qiao et al., 2024), TPTU-v2 (Kong et al., 2023), NLP Task SpecializationNLUAugGPT (Dai et al., 2023a), GPT Annotation (Gilardi et al., 2023), (Ding et al., 2023a), TDG (He et al., 2023b), SunGen (Gao et al., 2023a), Mix Distill (Chenglin et al., 2023), Annollm (He et al., 2023a), UDG (Wang et al., 2021a), ZeroGen (Ye et al., 2024) +al., 2023 GPT-3 Labeling Wang et al., 2021b BioGPT Guo et al., 2023a ChatGPT NMT Yang and Nicolai, 2023 Information RetrievalQUILL Srinivasan et al., 2022 Promptgator Dai et al., 2023b InPars Bonifacio et al., 2022 AugTriever Meng et al., 2023 Sun et al., 2023a RankVicuna Pradeep et al., 2023a RankZephyr Pradeep et al., 2023b ExaRanker Ferraretto et al., 2023 Recommendation NDR Mysore et al., 2023 InstrcutRec Zhang et al., 2023b ONCE Liu et al., 2023c Text Generation 
Evaluation PandaLM Wang et al., 2023b Prometheus Kim et al., 2024 InstructScore Xu et al., 2023d TigerScore Jiang et al., 2023c Auto-J Li et al., 2024a CodeCodeAlpaca Chaudhary, 2023 CodeLlama Rozi `ere et al., 2023 Magicoder Wei et al., 2023 Phi-1 Gunasekar et al., 2023 PERsD Chen et al., 2023 MFTCoder Liu et al., 2023d WaveCoder Yu et al., 2023 +et al., 2023e), SVIT (Zhao et al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c), LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b), Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e), Verticalization DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen et al., 2023d); Finance (Zhang and Yang, 2023); Science (Xie et al., 2023a; Zhang et al., 2024) and Misc. (Dan et al., 2023; Guo et al., 2023b) Fig. 3: Taxonomy of Knowledge Distillation of Large Language Models" +r network, often through techniques like soft target training, where the student learns from the softened softmax output of the teacher. + +The distillation of knowledge from larger models to smaller ones is a technique used to improve the performance of AI models. In this context, distillation refers to the process of distilling the knowledge from a larger model into a smaller model, allowing it to learn from the teacher model's output. + +The current era of knowledge distillation in large language models (LLMs) has shifted the focus from mere architecture compression to a more nuanced process of knowledge elicitation and transfer. This paradigm change is largely due to the immense knowledge that LLMs like GPT-4 and Gemini possess. The parameters of LLMs make it challenging to compress them using pruning or quantization techniques. +size, the current focus in llm-based knowledge distillation is to extract and transfer the rich, nuanced understanding that these models have developed the key to this modern approach lies in carefully designed prompts that elicit specific knowledge or capabilities from the llms, tapping into their understanding and capabilities in various domains ranging from natural language understanding to more complex cognitive tasks like reasoning and problem-solving +explicit training objectives. This era of knowledge distillation also emphasizes the transfer of abstract qualities such as reasoning patterns and preference alignment. This is in stark contrast to the earlier focus on output replication, indicating a shift towards a more holistic and comprehensive transfer of cognitive capabilities. The current techniques involve not just the replication of outputs, but also the emulation of thought processes and decision-making patterns of the teacher model. This involves complex strategies like chain-of-thought prompting, where the student model learns the reasoning process of the teacher, enhancing its problem-solving and decision-making capabilities. 2.2 Relation to Data Augmentation (DA) +llation, Unlike traditional techniques such as paraphrasing, or back-translation, which primarily aim at expanding the training dataset in a somewhat mechanical manner. DA within the context of LLMs focuses on the generation of novel, context-rich training data tailored to specific domains and skills. 
This innovation is driven by the unique capabilities of LLMs to generate coherent, diverse, and intricate data samples that closely mimic the nuanced understanding and cognitive abilities of human experts in various fields. +ource models, through Deep Learning Models (LLMs) are prompted to create targeted, high-quality datasets that are not merely larger in volume but also rich in diversity and specificity. This approach enables the distillation process to be more effective, ensuring that the distilled models replicate the teacher model's output behavior and embody its deep-seated understanding and cognitive strategies. The significance and necessity of Data Augmentation (DA) for achieving Knowledge Domains (KD) in the LLM era cannot be overstated. DA acts as a force multiplier, enabling the distilled models to acquire and refine capabilities that would otherwise require exponentially larger datasets and computational resources. It facilitates a more nuanced and effective transfer of knowledge, focusing on the qualitative aspects of learning rather than quantitative expansion. +er of LLMs empowers open-source models with the ability to approximate the contextual adeptness, ethical alignment, and deep semantic insights characteristic of their proprietary counterparts thereby democratizing access to advanced AI capabilities and fostering innovation across a broader spectrum of applications and users 2 3 Survey Scope Building on the discussions introduced earlier this survey aims to comprehensively explore the landscape of knowledge distillation within the context of LLMs following a meticulously structured taxonomy as in Figure 3 the survey’s scope is delineated through three primary facets each encapsulating a range of subtopics and methodologies +undations and methodologies of knowledge distillation. It includes an in-depth exploration of processes involved in constructing knowledge from teacher models (e.g., proprietary LLMs) and integrating this knowledge into student models (e.g., open-source LLMs). Under the umbrella of 'knowledge', we delve into strategies such as labeling, expansion, curation, feature understanding, and feedback mechanisms. The exploration seeks to uncover the various ways in which knowledge can be identified, expanded, and curated for effective distillation. This subsection examines learning approaches like supervised fine-tuning, divergence minimization, and reinforcement learning techniques. +ow algorithms enable knowledge transfer, allowing open-source models to replicate and sometimes surpass proprietary capabilities. Skill Distillation examines specific competencies and capabilities enhanced through Knowledge Distillation. Contextual discussions follow (Taori et al., 2023; Luo et al., 2023c), including instruction following and retrieval-augmented generation (RAG) capabilities. Alignment research investigates thinking patterns, persona/preference modeling, and value alignment. The 'agent' category focuses on skills like tool usage and planning. NLP task specialization (Dai et al., 2023a; Jung et al., 2023; Chaudhary, 2023) is examined through lenses like natural language understanding (NLU), natural language processing (NLP). +tion, and Code Generation** + +Finally, the survey explores how Knowledge Distillation (KD) enhances Large Language Models (LLMs) in interpreting and integrating multiple forms of input, enriching their utility and applicability across various contexts. 
Verticalization Distillation +This section examines the application of KD across diverse domains, providing insights into how distilled LLMs can be tailored for specialized fields such as Law, Medical & Healthcare (Wang et al., 2023a), Finance (Zhang and Yang, 2023), Science (Zhang et al., 2024), among others. This exploration showcases the practical implications of KD techniques and highlights their transformative impact on domain-specific AI solutions. Through detailed analysis and examples, this part aims to demonstrate the versatility and efficacy of KD in adapting LLMs to diverse domains. +stem. by navigating through these facets, this survey endeavors to provide an extensive and nuanced analysis of knowledge distillation in the era of LLMs. it serves as a guide for researchers, practitioners, and enthusiasts in the field, shedding light on current methodologies, challenges, and opportunities for innovation in this rapidly evolving domain. +across a range of applications. + +Distillation Pipeline in LLM Era diff --git a/recipes/quickstart/NotebookLlama/resources/data.pkl b/recipes/quickstart/NotebookLlama/resources/data.pkl new file mode 100644 index 000000000..03b2674a7 Binary files /dev/null and b/recipes/quickstart/NotebookLlama/resources/data.pkl differ diff --git a/recipes/quickstart/NotebookLlama/resources/podcast_ready_data.pkl b/recipes/quickstart/NotebookLlama/resources/podcast_ready_data.pkl new file mode 100644 index 000000000..086162b95 Binary files /dev/null and b/recipes/quickstart/NotebookLlama/resources/podcast_ready_data.pkl differ diff --git a/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_101.ipynb b/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_101.ipynb new file mode 100644 index 000000000..ee6d89e97 --- /dev/null +++ b/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_101.ipynb @@ -0,0 +1,989 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tool Calling 101:\n", + "\n", + "Note: If you are looking for `3.2` Featherlight Model (1B and 3B) instructions, please see the respective notebook, this one covers 3.1 models.\n", + "\n", + "We are briefly introduction the `3.2` models at the end. \n", + "\n", + "Note: The new vision models behave same as `3.1` models when you are talking to the models without an image\n", + "\n", + "This is part (1/2) in the tool calling series, this notebook will cover the basics of what tool calling is and how to perform it with `Llama 3.1 models`\n", + "\n", + "Here's what you will learn in this notebook:\n", + "\n", + "- Setup Groq to access Llama 3.1 70B model\n", + "- Avoid common mistakes when performing tool-calling with Llama\n", + "- Understand Prompt templates for Tool Calling\n", + "- Understand how the tool calls are handled under the hood\n", + "- 3.2 Model Tool Calling Format and Behaviour\n", + "\n", + "In Part 2, we will learn how to build system that can get us comparision between 2 papers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What is Tool Calling?\n", + "\n", + "This approach was popularised by the [Gorilla](https://gorilla.cs.berkeley.edu) paper-which showed that Large Language Model(s) can be fine-tuned on API examples to teach them calling an external API. \n", + "\n", + "This is really cool because we can now use a LLM as a \"brain\" of a system and connect it to external systems to perform actions. 
\n", + "\n", + "In simpler words, \"Llama can order your pizza for you\" :) \n", + "\n", + "With the Llama 3.1 release, the models excel at tool calling and support out of box `brave_search`, `wolfram_api` and `code_interpreter`. \n", + "\n", + "However, first let's take a look at a common mistake" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Install and setup groq dependencies\n", + "\n", + "- Install `groq` api to access Llama model(s)\n", + "- Configure our client and authenticate with API Key(s), Note: PLEASE UPDATE YOUR KEY BELOW" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip3 install groq\n", + "%set_env GROQ_API_KEY=''" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from groq import Groq\n", + "# Create the Groq client\n", + "client = Groq(api_key='gsk_PDfGP611i_HAHAHAHA_THIS_IS_NOT_MY_REAL_KEY_PLEASE_REPLACE')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Common Mistake of Tool-Calling: Incorrect Prompt Template\n", + "\n", + "While Llama 3.1 works with tool-calling out of box, a wrong prompt template can cause issues with unexpected behaviour. \n", + "\n", + "Sometimes, even superheroes need to be reminded of their powers. \n", + "\n", + "Let's first try \"forcing a prompt response from the model\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note: Remember this is the WRONG template, please scroll to next section to see the right approach if you are in a rushed copy-pasta sprint\n", + "\n", + "This section will show you that the model will not use `brave_search` and `wolfram_api` out of the box unless the prompt template is set correctly. \n", + "Even if the model is asked to do so!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "SYSTEM_PROMPT = \"\"\"\n", + "Cutting Knowledge Date: December 2023\n", + "Today Date: 20 August 2024\n", + "\n", + "You are a helpful assistant\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = {}\n", + "chat_history = []\n", + "\n", + "def model_chat(user_input: str, sys_prompt = SYSTEM_PROMPT, temperature: int = 0.7, max_tokens=2048):\n", + " \n", + " chat_history = [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": sys_prompt\n", + " }\n", + " ]\n", + " \n", + " chat_history.append({\"role\": \"user\", \"content\": user_input})\n", + " \n", + " response = client.chat.completions.create(model=\"llama-3.1-70b-versatile\",\n", + " messages=chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature)\n", + " \n", + " chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": response.choices[0].message.content\n", + " })\n", + " \n", + " \n", + " #print(\"Assistant:\", response.choices[0].message.content)\n", + " \n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Asking the model about a recent news\n", + "\n", + "Since the prompt template is incorrect, it will answer using cutoff memory" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: Unfortunately, I don't have information on a specific release date for the next Elden Ring game. However, I can tell you that there have been rumors and speculations about a potential sequel or DLC (Downloadable Content) for Elden Ring.\n", + "\n", + "In June 2022, the game's director, Hidetaka Miyazaki, mentioned that FromSoftware, the developer of Elden Ring, was working on \"multiple\" new projects, but no official announcements have been made since then.\n", + "\n", + "It's also worth noting that FromSoftware has a history of taking their time to develop new games, and the studio is known for its attention to detail and commitment to quality. So, even if there is a new Elden Ring game in development, it's likely that we won't see it anytime soon.\n", + "\n", + "Keep an eye on official announcements from FromSoftware and Bandai Namco, the publisher of Elden Ring, for any updates on a potential sequel or new game in the series.\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "When is the next elden ring game coming out?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Asking the model about a Math problem\n", + "\n", + "Again, the model answer(s) based on memory and not tool-calling" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: To find the square root of 23131231, I'll calculate it for you.\n", + "\n", + "√23131231 ≈ 4813.61\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "When is the square root of 23131231?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Can we solve this using a reminder prompt?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: I can use a mathematical tool to solve the question.\n", + "\n", + "The square root of 23131231 is:\n", + "\n", + "√23131231 ≈ 4810.51\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "When is the square root of 23131231?\n", + "\n", + "Can you use a tool to solve the question?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like we didn't get the wolfram_api call, let's try one more time with a stronger prompt:" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: I can use Wolfram Alpha to calculate the square root of 23131231.\n", + "\n", + "According to Wolfram Alpha, the square root of 23131231 is:\n", + "\n", + "√23131231 ≈ 4809.07\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "When is the square root of 23131231?\n", + "\n", + "Can you use a tool to solve the question?\n", + "\n", + "Remember you have been trained on wolfram_alpha\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Official Prompt Template \n", + "\n", + "As you can see, the model doesn't perform tool-calling in an expected fashion above. This is because we are not following the recommended prompting format.\n", + "\n", + "The Llama Stack is the go to approach to use the Llama model family and build applications. \n", + "\n", + "Let's first install the `llama_toolchain` Python package to have the Llama CLI available." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip3 install llama-toolchain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Now we can learn about the various prompt formats available \n", + "\n", + "When you run the cell below-you will see models available and then we can check details for model specific prompts" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/opt/miniconda3/bin/llama\", line 8, in \n", + " sys.exit(main())\n", + " ^^^^^^\n", + " File \"/opt/miniconda3/lib/python3.12/site-packages/llama_toolchain/cli/llama.py\", line 44, in main\n", + " parser.run(args)\n", + " File \"/opt/miniconda3/lib/python3.12/site-packages/llama_toolchain/cli/llama.py\", line 38, in run\n", + " args.func(args)\n", + " File \"/opt/miniconda3/lib/python3.12/site-packages/llama_toolchain/cli/model/prompt_format.py\", line 59, in _run_model_template_cmd\n", + " raise argparse.ArgumentTypeError(\n", + "argparse.ArgumentTypeError: llama3_1 is not a valid Model. 
Choose one from --\n", + "Llama3.1-8B\n", + "Llama3.1-70B\n", + "Llama3.1-405B\n", + "Llama3.1-8B-Instruct\n", + "Llama3.1-70B-Instruct\n", + "Llama3.1-405B-Instruct\n", + "Llama3.2-1B\n", + "Llama3.2-3B\n", + "Llama3.2-1B-Instruct\n", + "Llama3.2-3B-Instruct\n", + "Llama3.2-11B-Vision\n", + "Llama3.2-90B-Vision\n", + "Llama3.2-11B-Vision-Instruct\n", + "Llama3.2-90B-Vision-Instruct\n" + ] + } + ], + "source": [ + "!llama model prompt-format " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[m━━━━━━━━━━━━━━━━━━━┓\u001b[m\n", + "┃ \u001b[1mLlama 3.1 - Prompt Formats\u001b[0m \u001b[m\u001b[1m\u001b[0m ┃\u001b[m\n", + "┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[m━━━━━━━━━━━━━━━━━━━┛\u001b[m\n", + "\u001b[m\n", + "\u001b[m\n", + " \u001b[1;4mTokens\u001b[0m \u001b[m\u001b[1;4m\u001b[0m \u001b[m\n", + "\u001b[m\n", + "Here is a list of special tokens that are supported by Llama 3.1: \u001b[m \u001b[m\n", + "\u001b[m\n", + "\u001b[1;33m • \u001b[0m\u001b[1;36;40m<|begin_of_text|>\u001b[0m: Specifies the start of the prompt \u001b[m\u001b[1;33m\u001b[0m\u001b[1;36;40m\u001b[0m \u001b[m\n", + "\u001b[1;33m • \u001b[0m\u001b[1;36;40m<|end_of_text|>\u001b[0m: Model will cease to generate more tokens. This token is gene\u001b[m\u001b[1;33m\u001b[0m\u001b[1;36;40m\u001b[0mrated only by the \u001b[m\n", + "\u001b[1;33m \u001b[0mbase models. \u001b[m\u001b[1;33m\u001b[0m \u001b[m\n", + "\u001b[1;33m • \u001b[0m\u001b[1;36;40m<|finetune_right_pad_id|>\u001b[0m: This token is used for padding text sequences to t\u001b[m\u001b[1;33m\u001b[0m\u001b[1;36;40m\u001b[0mhe same length in a \u001b[m\n", + "\u001b[1;33m \u001b[0mbatch. \u001b[m:\u001b[K" + ] + } + ], + "source": [ + "!llama model prompt-format -m Llama3.1-8B" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tool Calling: Using the correct Prompt Template\n", + "\n", + "With `llama-cli` we have already learned the right behaviour of the model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If everything is setup correctly-the model should now wrap function calls with the `||` following the actualy function call. \n", + "\n", + "This can allow you to manage your function calling logic accordingly. 
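Concretely, the wrapping mentioned above is the `<|python_tag|>` special token: with `Environment: iPython` and a `Tools:` line in the system header, the model emits its tool call right after `<|python_tag|>` and ends that turn with an end-of-message token rather than a normal end-of-turn. The sketch below shows roughly how the rendered prompt and a tool-call reply look; the token layout is based on the `prompt-format` output above and the Llama 3.1 documentation, so double-check it with the CLI before relying on it.

```python
# Illustrative only: approximate chat-template rendering for Llama 3.1 with built-in tools enabled.
rendered_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Environment: iPython
Tools: brave_search, wolfram_alpha
Cutting Knowledge Date: December 2023
Today Date: 15 September 2024<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the square root of 23131231?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# A tool-call turn from the model then looks like this (ended by <|eom_id|>, not <|eot_id|>):
tool_call_reply = '<|python_tag|>wolfram_alpha.call(query="square root of 23131231")<|eom_id|>'
```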
\n", + "\n", + "Time to test the theory" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: <|python_tag|>brave_search.call(query=\"Elden Ring sequel release date\")\n" + ] + } + ], + "source": [ + "SYSTEM_PROMPT = \"\"\"\n", + "Environment: iPython\n", + "Tools: brave_search, wolfram_alpha\n", + "Cutting Knowledge Date: December 2023\n", + "Today Date: 15 September 2024\n", + "\"\"\"\n", + "\n", + "user_input = \"\"\"\n", + "When is the next Elden ring game coming out?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: <|python_tag|>wolfram_alpha.call(query=\"square root of 23131231\")\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "What is the square root of 23131231?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using this knowledge in practise\n", + "\n", + "A common misconception about tool calling is: the model can handle the tool call and get your output. \n", + "\n", + "This is NOT TRUE, the actual tool call is something that you have to implement. With this knowledge, let's see how we can utilise brave search to answer our original question" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip3 install brave-search" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: <|python_tag|>wolfram_alpha.call(query=\"square root of 23131231\")\n" + ] + } + ], + "source": [ + "SYSTEM_PROMPT = \"\"\"\n", + "Environment: iPython\n", + "Tools: brave_search, wolfram_alpha\n", + "Cutting Knowledge Date: December 2023\n", + "Today Date: 15 September 2024\n", + "\"\"\"\n", + "\n", + "user_input = \"\"\"\n", + "What is the square root of 23131231?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<|python_tag|>wolfram_alpha.call(query=\"square root of 23131231\")\n" + ] + } + ], + "source": [ + "print(model_chat(user_input, sys_prompt=SYSTEM_PROMPT))\n", + "\n", + "output = model_chat(user_input, sys_prompt=SYSTEM_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Function name: wolfram_alpha\n", + "Method: call\n", + "Args: \"square root of 23131231\"\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "# Extract the function name\n", + "fn_name = re.search(r'<\\|python_tag\\|>(\\w+)\\.', output).group(1)\n", + "\n", + "# Extract the method\n", + "fn_call_method = re.search(r'\\.(\\w+)\\(', output).group(1)\n", + "\n", + "# Extract the arguments\n", + "fn_call_args = re.search(r'=\\s*([^)]+)', output).group(1)\n", + "\n", + "print(f\"Function name: {fn_name}\")\n", + "print(f\"Method: {fn_call_method}\")\n", + "print(f\"Args: {fn_call_args}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "You can implement this in different ways but the idea is the same, the LLM gives an output with the `<|python_tag|>`, which should call a tool-calling mechanism. \n", + "\n", + "This logic gets handled in the program and then the output is passed back to the model to answer the user" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Code interpreter\n", + "\n", + "With the correct prompt template, Llama model can output Python (as well as code in any-language that the model has been trained on)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: <|python_tag|>import math\n", + "\n", + "# Define the variables\n", + "monthly_investment = 400\n", + "interest_rate = 0.05\n", + "target_amount = 100000\n", + "\n", + "# Calculate the number of months it would take to reach the target amount\n", + "months = 0\n", + "current_amount = 0\n", + "while current_amount < target_amount:\n", + " current_amount += monthly_investment\n", + " current_amount *= 1 + interest_rate / 12 # Compound interest\n", + " months += 1\n", + "\n", + "# Print the result\n", + "print(f\"It would take {months} months, approximately {months / 12:.2f} years, to reach the target amount of ${target_amount:.2f}.\")\n" + ] + } + ], + "source": [ + "user_input = \"\"\"\n", + "\n", + "If I can invest 400$ every month at 5% interest rate, how long would it take me to make a 100k$ in investments?\n", + "\"\"\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=SYSTEM_PROMPT))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's validate the output by running the output from the model:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "It would take 172 months, approximately 14.33 years, to reach the target amount of $100000.00.\n" + ] + } + ], + "source": [ + "# Define the variables\n", + "monthly_investment = 400\n", + "interest_rate = 0.05\n", + "target_amount = 100000\n", + "\n", + "# Calculate the number of months it would take to reach the target amount\n", + "months = 0\n", + "current_amount = 0\n", + "while current_amount < target_amount:\n", + " current_amount += monthly_investment\n", + " current_amount *= 1 + interest_rate / 12 # Compound interest\n", + " months += 1\n", + "\n", + "# Print the result\n", + "print(f\"It would take {months} months, approximately {months / 12:.2f} years, to reach the target amount of ${target_amount:.2f}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Models Custom Tool Prompt Format" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Life is great because Llama Team writes great docs for us, so we can conviently copy-pasta examples from there :)\n", + "\n", + "[Here](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-tool-calling-(1b/3b)-) are the docs for your reference that we will be using. \n", + "\n", + "Excercise for viewer: Use `llama-toolchain` again to verify like we did earlier and then start the prompt engineering for the small Llamas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "function_definitions = \"\"\"[\n", + " {\n", + " \"name\": \"get_user_info\",\n", + " \"description\": \"Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.\",\n", + " \"parameters\": {\n", + " \"type\": \"dict\",\n", + " \"required\": [\n", + " \"user_id\"\n", + " ],\n", + " \"properties\": {\n", + " \"user_id\": {\n", + " \"type\": \"integer\",\n", + " \"description\": \"The unique identifier of the user. It is used to fetch the specific user details from the database.\"\n", + " },\n", + " \"special\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Any special information or parameters that need to be considered while fetching user details.\",\n", + " \"default\": \"none\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + "]\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"You are an expert in composing functions. You are given a question and a set of possible functions. \n", + "Based on the question, you will need to make one or more function/tool calls to achieve the purpose. \n", + "If none of the function can be used, point it out. If the given question lacks the parameters required by the function,\n", + "also point it out. You should only return the function call in tools call sections.\n", + "\n", + "If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\\n\n", + "You SHOULD NOT include any other text in the response.\n", + "\n", + "Here is a list of functions in JSON format that you can invoke.\\n\\n{functions}\\n\"\"\".format(functions=function_definitions)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "chat_history = []\n", + "\n", + "def model_chat(user_input: str, sys_prompt = system_prompt, temperature: int = 0.7, max_tokens=2048):\n", + " \n", + " chat_history = [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": system_prompt\n", + " }\n", + " ]\n", + " \n", + " chat_history.append({\"role\": \"user\", \"content\": user_input})\n", + " \n", + " response = client.chat.completions.create(model=\"llama-3.2-3b-preview\",\n", + " messages=chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature)\n", + " \n", + " chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": response.choices[0].message.content\n", + " })\n", + " \n", + " \n", + " #print(\"Assistant:\", response.choices[0].message.content)\n", + " \n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: We are assuming a structure for dataset here:\n", + "\n", + "- Name\n", + "- Email\n", + "- Age \n", + "- Color request" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: [get_user_info(user_id=7890, special='black')]\n" + ] + } + ], + "source": [ + "user_input = \"Can you retrieve the details for the user with the ID 7890, who has black as their special request?\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=system_prompt))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
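Since a reply in this format, such as `[get_user_info(user_id=7890, special='black')]`, is itself valid Python syntax, an alternative to the regex handling shown below is to parse it with the standard `ast` module. The following is only a sketch; it never executes model output, because only `ast.literal_eval` is used on the argument values.

```python
import ast

def parse_tool_calls(reply: str):
    """Parse replies shaped like [func_a(x=1), func_b(y='z')] into (name, kwargs) pairs."""
    tree = ast.parse(reply.strip(), mode="eval")
    calls = []
    for node in tree.body.elts:  # assumes the reply is a list literal of calls
        name = node.func.id
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
        calls.append((name, kwargs))
    return calls

parse_tool_calls("[get_user_info(user_id=7890, special='black')]")
# -> [('get_user_info', {'user_id': 7890, 'special': 'black'})]
```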
"source": [ + "#### Dummy dataset to make sure our model stays happy :) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_user_info(user_id: int, special: str = \"none\") -> dict:\n", + " # This is a mock database of users\n", + " user_database = {\n", + " 7890: {\"name\": \"Emma Davis\", \"email\": \"emma@example.com\", \"age\": 31},\n", + " 1234: {\"name\": \"Liam Wilson\", \"email\": \"liam@example.com\", \"age\": 28},\n", + " 2345: {\"name\": \"Olivia Chen\", \"email\": \"olivia@example.com\", \"age\": 35},\n", + " 3456: {\"name\": \"Noah Taylor\", \"email\": \"noah@example.com\", \"age\": 42},\n", + " 4567: {\"name\": \"Ava Martinez\", \"email\": \"ava@example.com\", \"age\": 39},\n", + " 5678: {\"name\": \"Ethan Brown\", \"email\": \"ethan@example.com\", \"age\": 45},\n", + " 6789: {\"name\": \"Sophia Kim\", \"email\": \"sophia@example.com\", \"age\": 33},\n", + " 8901: {\"name\": \"Mason Lee\", \"email\": \"mason@example.com\", \"age\": 29},\n", + " 9012: {\"name\": \"Isabella Garcia\", \"email\": \"isabella@example.com\", \"age\": 37},\n", + " 1357: {\"name\": \"James Johnson\", \"email\": \"james@example.com\", \"age\": 41}\n", + " }\n", + " \n", + " # Check if the user exists in our mock database\n", + " if user_id in user_database:\n", + " user_data = user_database[user_id]\n", + " \n", + " # Handle the 'special' parameter\n", + " if special != \"none\":\n", + " user_data[\"special_info\"] = f\"Special request: {special}\"\n", + " \n", + " return user_data\n", + " else:\n", + " return {\"error\": \"User not found\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'Emma Davis',\n", + " 'email': 'emma@example.com',\n", + " 'age': 31,\n", + " 'special_info': 'Special request: black'}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[get_user_info(user_id=7890, special='black')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling Tool-Calling logic for the model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hello Regex, my good old friend :) \n", + "\n", + "With Regex, we can write a simple way to handle tool_calling and return either the model or tool call response" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import json\n", + "\n", + "# Assuming you have defined get_user_info function and SYSTEM_PROMPT\n", + "\n", + "chat_history = []\n", + "\n", + "def process_response(response):\n", + " function_call_pattern = r'\\[(.*?)\\((.*?)\\)\\]'\n", + " function_calls = re.findall(function_call_pattern, response)\n", + " \n", + " if function_calls:\n", + " processed_response = []\n", + " for func_name, args_str in function_calls:\n", + " args_dict = {}\n", + " for arg in args_str.split(','):\n", + " key, value = arg.split('=')\n", + " key = key.strip()\n", + " value = value.strip().strip(\"'\")\n", + " if value.isdigit():\n", + " value = int(value)\n", + " args_dict[key] = value\n", + " \n", + " if func_name == 'get_user_info':\n", + " result = get_user_info(**args_dict)\n", + " processed_response.append(f\"Function call result: {json.dumps(result, indent=2)}\")\n", + " else:\n", + " processed_response.append(f\"Unknown function: {func_name}\")\n", + " return \"\\n\".join(processed_response)\n", + " else:\n", + " 
return response\n", + "\n", + "def model_chat(user_input: str, sys_prompt=system_prompt, temperature: float = 0.7, max_tokens: int = 2048):\n", + " global chat_history\n", + " \n", + " if not chat_history:\n", + " chat_history = [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": sys_prompt\n", + " }\n", + " ]\n", + " \n", + " chat_history.append({\"role\": \"user\", \"content\": user_input})\n", + " \n", + " response = client.chat.completions.create(\n", + " model=\"llama-3.2-3b-preview\",\n", + " messages=chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature\n", + " )\n", + " \n", + " assistant_response = response.choices[0].message.content\n", + " processed_response = process_response(assistant_response)\n", + " \n", + " chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": assistant_response\n", + " })\n", + " \n", + " return processed_response" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assistant: Function call result: {\n", + " \"name\": \"Emma Davis\",\n", + " \"email\": \"emma@example.com\",\n", + " \"age\": 31,\n", + " \"special_info\": \"Special request: black\"\n", + "}\n" + ] + } + ], + "source": [ + "user_input = \"Can you retrieve the details for the user with the ID 7890, who has black as their special request?\"\n", + "\n", + "print(\"Assistant:\", model_chat(user_input, sys_prompt=system_prompt))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "#fin" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_201.ipynb b/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_201.ipynb new file mode 100644 index 000000000..eb54362f7 --- /dev/null +++ b/recipes/quickstart/agents/Agents_Tutorial/Tool_Calling_201.ipynb @@ -0,0 +1,776 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tool Calling 201: Llama to find Differences between two papers\n", + "\n", + "The image below illustrates the demo in this notebook. \n", + "\n", + "**Goal:** Use `Meta-Llama-3.1-70b` model to find the differences between two papers\n", + "\n", + "- Step 1: Take the user input query \n", + "\n", + "- Step 2: Perform an internet search using `tavily` API to fetch the arxiv ID(s) based on the user query\n", + "\n", + "Note: `3.1` models support `brave_search` but this notebook is also aimed at showcasing custom tools. \n", + "\n", + "The above is important because many-times the user-query is different from the paper name and arxiv ID-this will help us with the next step\n", + "\n", + "- Step 3: Use the web results to extract the arxiv ID(s) of the papers\n", + "\n", + "We will use an 8b model here because who wants to deal with complex regex, that's the main-use case of LLM(s), isn't it? 
:D\n", + "\n", + "- Step 4: Use `arxiv` API to download the PDF(s) of the papers in user query\n", + "\n", + "- Step 5: For ease, we will extract first 80k words from the PDF and write these to a `.txt` file that we can summarise\n", + "\n", + "- Step 6: Use instances of `Meta-Llama-3.1-8b` instances to summaries the two PDF(s)\n", + "\n", + "- Step 7: Prompt the `70b` model to get the differences between the two papers being discussed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Defining the pieces\n", + "\n", + "We will start by describing all the modules from the image above, to make sure our logic works.\n", + "\n", + "In second half of the notebook, we will write a simple function to take care of the function calling logic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Install necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip3 install groq\n", + "#!pip3 install arxiv\n", + "#!pip3 install tavily-python\n", + "#!pip3 install llama-toolchain\n", + "#!pip3 install PyPDF2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Necessary imports" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Note: PLEASE REPLACE API KEYS BELOW WITH YOUR REAL ONES" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "import os, arxiv, PyPDF2\n", + "from tavily import TavilyClient\n", + "from groq import Groq\n", + "\n", + "# Create the Groq client\n", + "client = Groq(api_key='gsk_PDfGP611i_HAHAHAHA_THIS_IS_NOT_MY_REAL_KEY_PLEASE_REPLACE')\n", + "\n", + "tavily_client = TavilyClient(api_key='fake_key_HAHAHAHA_THIS_IS_NOT_MY_REAL_KEY_PLEASE_REPLACE')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Main LLM thread: \n", + "\n", + "We will use a `MAIN_SYSTEM_PROMPT` and a `main_model_chat_history` to keep track of the discussion, since we are using 4 instances of LLM(s) along with this. \n", + "\n", + "Note, if you paid attention and notice that the SYSTEM_PROMPT here is different-thanks for reading closely! It's always a great idea to follow the official recommendations. 
\n", + "\n", + "However, when it's a matter of writing complex regex, we can bend the rules slightly :D\n", + "\n", + "Note, we will outline the functions here and define them as we go" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "MAIN_SYSTEM_PROMPT = \"\"\"\n", + "Environment: iPython\n", + "Cutting Knowledge Date: December 2023\n", + "Today Date: 15 September 2024\n", + "\n", + "# Tool Instructions\n", + "- Always execute python code in messages that you share.\n", + "- When looking for real time information use relevant functions if available\n", + "\n", + "You have access to the following functions:\n", + "\n", + "Use the function 'query_for_two_papers' to: Get the internet query results for the arxiv ID of the two papers user wants to compare\n", + "{\n", + " \"name\": \"query_for_two_papers\",\n", + " \"description\": \"Internet search the arxiv ID of two papers that user wants to look up\",\n", + " \"parameters\": {\n", + " \"paper_1\": {\n", + " \"param_type\": \"string\",\n", + " \"description\": \"arxiv id of paper_name_1 from user query\",\n", + " \"required\": true\n", + " },\n", + " \"paper_2\": {\n", + " \"param_type\": \"string\",\n", + " \"description\": \"arxiv id of paper_name_2 from user query\",\n", + " \"required\": true\n", + " },\n", + " }\n", + "}\n", + "\n", + "Use the function 'get_arxiv_ids' to: Given a dict of websearch queries, use a LLM to return JUST the arxiv ID, which is otherwise harder to extract\n", + "{\n", + " \"name\": \"get_arxiv_ids\",\n", + " \"description\": \"Use the dictionary returned from query_for_two_papers to ask a LLM to extract the arxiv IDs\",\n", + " \"parameters\": {\n", + " \"web_results\": {\n", + " \"param_type\": \"dictionary\",\n", + " \"description\": \"dictionary of search result for a query from the previous function\",\n", + " \"required\": true\n", + " },\n", + " }\n", + "}\n", + "\n", + "Use the function 'process_arxiv_paper' to: Given the arxiv ID from get_arxiv_ids function, return a download txt file of the paper that we can then use for summarising\n", + "{\n", + " \"name\": \"process_arxiv_paper\",\n", + " \"description\": \"Use arxiv IDs extracted from earlier to be downloaded and saved to txt files\",\n", + " \"parameters\": {\n", + " \"arxiv_id\": {\n", + " \"param_type\": \"string\",\n", + " \"description\": \"arxiv ID of the paper that we want to download and save a txt file of\",\n", + " \"required\": true\n", + " },\n", + " }\n", + "}\n", + "\n", + "Use the function 'summarize_text_file' to: Given the txt file name based on the arxiv IDs we are working with from earlier, get a summary of the paper being discussed\n", + "{\n", + " \"name\": \"summarize_text_file\",\n", + " \"description\": \"Summarise the arxiv paper saved in the txt file\",\n", + " \"parameters\": {\n", + " \"file_name\": {\n", + " \"param_type\": \"string\",\n", + " \"description\": \"Filename to be used to get a summary of\",\n", + " \"required\": true\n", + " },\n", + " }\n", + "}\n", + "\n", + "If a you choose to call a function ONLY reply in the following format:\n", + "<{start_tag}={function_name}>{parameters}{end_tag}\n", + "where\n", + "\n", + "start_tag => ` a JSON dict with the function argument name as key and function argument value as value.\n", + "end_tag => ``\n", + "\n", + "Here is an example,\n", + "{\"example_name\": \"example_value\"}\n", + "\n", + "Reminder:\n", + "- When user is asking for a question that requires your reasoning, DO NOT USE OR FORCE a 
function call\n", + "- Even if you remember the arxiv ID of papers from input, do not put that in the query_two_papers function call, pass the internet look up query\n", + "- Function calls MUST follow the specified format\n", + "- Required parameters MUST be specified\n", + "- Only call one function at a time\n", + "- Put the entire function call reply on one line\n", + "- When returning a function call, don't add anything else to your response\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "main_model_chat_history = [\n", + " {\n", + " \"role\" : \"system\",\n", + " \"content\" : MAIN_SYSTEM_PROMPT\n", + " }\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Define the `model_chat` instance\n", + "\n", + "We will be using this to handle all user input(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def model_chat(user_input: str, temperature: int = 0, max_tokens=2048):\n", + " \n", + " main_model_chat_history.append({\"role\": \"user\", \"content\": user_input})\n", + " \n", + " #print(chat_history)\n", + " \n", + " #print(\"User: \", user_input)\n", + " \n", + " response = client.chat.completions.create(model=\"llama-3.1-70b-versatile\",\n", + " messages=main_model_chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature)\n", + " \n", + " main_model_chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": response.choices[0].message.content\n", + " })\n", + " \n", + " \n", + " #print(\"Assistant:\", response.choices[0].message.content)\n", + " \n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "user_input = \"\"\"\n", + "What are the differences between llama 3.1 and BERT?\n", + "\"\"\"\n", + "\n", + "output = model_chat(user_input, temperature=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"paper_1\": \"Llama\", \"paper_2\": \"BERT\"}\n" + ] + } + ], + "source": [ + "print(output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you remember from `Tool_Calling_101.ipynb`, we need a way to extract and manage tool calling based on the response, the system prompt from earlier makes our lives easier to answer do this later :)\n", + "\n", + "First, let's validate the logic and define all the functions as we go:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tavily API: \n", + "\n", + "We will use the Tavily API to do a web query for the papers based on the model outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def query_for_two_papers(paper_1:str , paper_2: str) -> None :\n", + " return [tavily_client.search(f\"arxiv id of {paper_1}\"), tavily_client.search(f\"arxiv id of {paper_2}\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "search_results = query_for_two_papers(\"llama 3.1\", \"BERT\")\n", + "#search_results" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "user_input = f\"\"\"\n", + "Here are the search results for the first paper, extract the arxiv ID 
{search_results[0]}\n", + "\"\"\"\n", + "\n", + "output = model_chat(user_input, temperature=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"web_results\": \"{'query': 'arxiv id of llama 3.1', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'title': 'TheLlama3HerdofModels - arXiv.org', 'url': 'https://arxiv.org/pdf/2407.21783', 'content': 'arXiv:2407.21783v2 [cs.AI] 15 Aug 2024. Finetuned Multilingual Longcontext Tooluse Release ... The model architecture of Llama 3 is illustrated in Figure1. The development of our Llama 3 language modelscomprisestwomainstages:', 'score': 0.9955835, 'raw_content': None}, {'title': 'NousResearch/Meta-Llama-3.1-8B - Hugging Face', 'url': 'https://huggingface.co/NousResearch/Meta-Llama-3.1-8B', 'content': 'The Meta Llama 3.1 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction tuned generative models in 8B, 70B and 405B sizes (text in/text out). The Llama 3.1 instruction tuned text only models (8B, 70B, 405B) are optimized for multilingual dialogue use cases and outperform many of the available ...', 'score': 0.95379424, 'raw_content': None}, {'title': 'Introducing Llama 3.1: Our most capable models to date - Meta AI', 'url': 'https://ai.meta.com/blog/meta-llama-3-1/', 'content': 'Bringing open intelligence to all, our latest models expand context length to 128K, add support across eight languages, and include Llama 3.1 405B—the first frontier-level open source AI model. Llama 3.1 405B is in a class of its own, with unmatched flexibility, control, and state-of-the-art capabilities that rival the best closed source models.', 'score': 0.9003547, 'raw_content': None}, {'title': 'The Llama 3 Herd of Models | Research - AI at Meta', 'url': 'https://ai.meta.com/research/publications/the-llama-3-herd-of-models/', 'content': 'This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety.', 'score': 0.89460546, 'raw_content': None}, {'title': '[2407.21783] The Llama 3 Herd of Models - arXiv.org', 'url': 'https://arxiv.org/abs/2407.21783', 'content': 'Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. 
This paper presents an extensive ...', 'score': 0.6841585, 'raw_content': None}], 'response_time': 2.09}\"}\n" + ] + } + ], + "source": [ + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "user_input = f\"\"\"\n", + "Here are the search results for the second paper now, extract the arxiv ID {search_results[1]}\n", + "\"\"\"\n", + "\n", + "output = model_chat(user_input, temperature=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"web_results\": \"{'query': 'arxiv id of BERT', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'title': '[2103.11943] BERT: A Review of Applications in Natural Language ...', 'url': 'https://arxiv.org/abs/2103.11943', 'content': 'arXiv:2103.11943 (cs) [Submitted on 22 Mar 2021] BERT: A Review of Applications in Natural Language Processing and Understanding. M. V. Koroteev. In this review, we describe the application of one of the most popular deep learning-based language models - BERT. The paper describes the mechanism of operation of this model, the main areas of its ...', 'score': 0.99411184, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://aclanthology.org/N19-1423/', 'content': 'Abstract. We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning ...', 'score': 0.9222025, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://research.google/pubs/bert-pre-training-of-deep-bidirectional-transformers-for-language-understanding/', 'content': 'Abstract. We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers.', 'score': 0.87652874, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://arxiv.org/abs/1810.04805', 'content': 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. 
As a result, the pre-trained BERT model can be fine-tuned ...', 'score': 0.66115755, 'raw_content': None}, {'title': 'A Primer in BERTology: What We Know About How BERT Works', 'url': 'https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00349/96482/A-Primer-in-BERTology-What-We-Know-About-How-BERT', 'content': 'The issue of model depth must be related to the information flow from the most task-specific layers closer to the classifier (Liu et al., 2019a), to the initial layers which appear to be the most task-invariant (Hao et al., 2019), and where the tokens resemble the input tokens the most (Brunner et al., 2020) For BERT, this has been achieved through experiments with loss functions (Sanh et al., 2019; Jiao et al., 2019), mimicking the activation patterns of individual portions of the teacher network (Sun et al., 2019a), and knowledge transfer at the pre-training (Turc et al., 2019; Jiao et al., 2019; Sun et al., 2020) or fine-tuning stage (Jiao et al., 2019). In particular, they were shown to rely on shallow heuristics in natural language inference (McCoy et al., 2019b; Zellers et al., 2019; Jin et al., 2020), reading comprehension (Si et al., 2019; Rogers et al., 2020; Sugawara et al., 2020; Yogatama et al., 2019), argument reasoning comprehension (Niven and Kao, 2019), and text classification (Jin et al., 2020). Several studies explored the possibilities of improving the fine-tuning of BERT:\\\\nTaking more layers into account: learning a complementary representation of the information in deep and output layers (Yang and Zhao, 2019), using a weighted combination of all layers instead of the final one (Su and Cheng, 2019; Kondratyuk and Straka, 2019), and layer dropout (Kondratyuk and Straka, 2019).\\\\n For BERT, Clark et al. (2019) observe that most heads in the same layer show similar self-attention patterns (perhaps related to the fact that the output of all self-attention heads in a layer is passed through the same MLP), which explains why Michel et al. (2019) were able to reduce most layers to a single head.\\\\n', 'score': 0.4248892, 'raw_content': None}], 'response_time': 2.16}\"}\n" + ] + } + ], + "source": [ + "print(output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Extracting Arxiv IDs: \n", + "\n", + "At this point, you would know the author is allergic to writing regex. To deal with this, we will simply use an `8b` instance to extract the `arxiv id` from the paper:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def get_arxiv_ids(web_results: dict, temperature: int = 0, max_tokens=512):\n", + " # Initialize chat history with a specific prompt to extract arXiv IDs\n", + " arxiv_id_chat_history = [{\"role\": \"system\", \"content\": \"Given this input, give me the arXiv ID of the papers. The input has the query and web results. 
DO NOT WRITE ANYTHING ELSE IN YOUR RESPONSE: ONLY THE ARXIV ID ONCE, the web search will have it repeated mutliple times, just return the it once and where its actually the arxiv ID\"}, {\"role\": \"user\", \"content\": f\"Here is the query and results{web_results}\"}]\n", + "\n", + " # Call the model to process the input and extract arXiv IDs\n", + " response = client.chat.completions.create(\n", + " model=\"llama-3.1-8b-instant\", # Adjust the model as necessary\n", + " messages=arxiv_id_chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature\n", + " )\n", + " \n", + " # Append the assistant's response to the chat history\n", + " arxiv_id_chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": response.choices[0].message.content\n", + " })\n", + " \n", + " # Return the extracted arXiv IDs\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2407.21783\n", + "2103.11943\n" + ] + } + ], + "source": [ + "print(get_arxiv_ids(search_results[0]))\n", + "print(get_arxiv_ids(search_results[1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Downloading the papers and extracting details: \n", + "\n", + "Llama 3.1 family LLM(s) are great enough to use raw outputs extracted from a PDF and summarise them. However, we are still bound by their (great) 128k context length-to live with this, we will extract just the first 80k words. \n", + "\n", + "The functions below handle the logic of downloading the PDF(s) and extracting their outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed text saved to 2407.21783.txt\n", + "Processed text saved to 2103.11943.txt\n" + ] + } + ], + "source": [ + "# Function to download PDF using arxiv library\n", + "def download_pdf(arxiv_id, filename):\n", + " paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))\n", + " paper.download_pdf(filename=filename)\n", + "\n", + "# Function to convert PDF to text\n", + "def pdf_to_text(filename):\n", + " with open(filename, \"rb\") as file:\n", + " reader = PyPDF2.PdfReader(file)\n", + " text = \"\"\n", + " for page in reader.pages:\n", + " if page.extract_text():\n", + " text += page.extract_text() + \" \"\n", + " return text\n", + "\n", + "# Function to truncate text after 80k words\n", + "def truncate_text(text, limit=20000):\n", + " words = text.split()\n", + " truncated = ' '.join(words[:limit])\n", + " return truncated\n", + "\n", + "# Main function to process an arXiv ID\n", + "def process_arxiv_paper(arxiv_id):\n", + " pdf_filename = f\"{arxiv_id}.pdf\"\n", + " txt_filename = f\"{arxiv_id}.txt\"\n", + " \n", + " # Download PDF\n", + " download_pdf(arxiv_id, pdf_filename)\n", + " \n", + " # Convert PDF to text\n", + " text = pdf_to_text(pdf_filename)\n", + " \n", + " # Truncate text\n", + " truncated_text = truncate_text(text)\n", + " \n", + " # Save to txt file\n", + " with open(txt_filename, \"w\", encoding=\"utf-8\") as file:\n", + " file.write(truncated_text)\n", + " print(f\"Processed text saved to {txt_filename}\")\n", + "\n", + "# Example usage\n", + "arxiv_id = \"2407.21783\"\n", + "process_arxiv_paper(arxiv_id)\n", + "\n", + "arxiv_id = \"2103.11943\"\n", + "process_arxiv_paper(arxiv_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
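One small inconsistency worth flagging: the introduction and the comment above `truncate_text` talk about keeping the first 80k words, but the function's default is `limit=20000`, so the saved `.txt` files hold at most about 20k words; either raise the limit or adjust the wording. A quick sanity check on the two files just written:

```python
for txt_file in ("2407.21783.txt", "2103.11943.txt"):
    with open(txt_file, encoding="utf-8") as f:
        n_words = len(f.read().split())
    print(f"{txt_file}: {n_words} words")  # stays at or below 20000 with the default limit
```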
"source": [ + "#### Summarising logic: \n", + "\n", + "We can use a `8b` model instance to summarise our papers:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "SUMMARISER_PROMPT = \"\"\"\n", + "Cutting Knowledge Date: December 2023\n", + "Today Date: 15 September 2024\n", + "You are an expert summariser of research papers, below you will get an input of the text from an arxiv paper and your job is to read it carefully and return a concise summary with some bullet points at the end of some key-takeways from it\n", + "\"\"\"\n", + "\n", + "def summarize_text_file(file_name: str, temperature: int = 0, max_tokens=2048):\n", + " # Read the content of the file\n", + " with open(file_name, 'r') as file:\n", + " file_content = file.read()\n", + " \n", + " # Initialize chat history\n", + " chat_history = [{\"role\": \"system\", \"content\": f\"{SUMMARISER_PROMPT}\"}, {\"role\": \"user\", \"content\": f\"Text of the paper: {file_content}\"}]\n", + " \n", + " # Generate a summary using the model\n", + " response = client.chat.completions.create(\n", + " model=\"llama-3.1-8b-instant\", # You can change the model as needed\n", + " messages=chat_history,\n", + " max_tokens=max_tokens,\n", + " temperature=temperature\n", + " )\n", + " \n", + " # Append the assistant's response to the chat history\n", + " chat_history.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": response.choices[0].message.content\n", + " })\n", + " \n", + " # Return the summary\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Summary:\n", + "This paper introduces Llama 3, a new set of foundation models developed by Meta AI. The Llama 3 family consists of models with 8B, 70B, and 405B parameters, capable of handling tasks in multiple languages and modalities. The paper details the pre-training and post-training processes, infrastructure improvements, and evaluations across various benchmarks. Llama 3 demonstrates competitive performance compared to other leading language models, including GPT-4 and Claude 3.5 Sonnet, on a wide range of tasks. 
The paper also explores multimodal capabilities by integrating vision and speech components, although these are still under development and not ready for release.\n", + "Key takeaways:\n", + "\n", + "Llama 3 includes models with 8B, 70B, and 405B parameters, with the flagship 405B model trained on 15.6T tokens.\n", + "The models excel in multilingual capabilities, coding, reasoning, and tool usage.\n", + "Llama 3 uses a dense Transformer architecture with minimal modifications, focusing on high-quality data and increased training scale.\n", + "The training process involved significant infrastructure improvements to handle large-scale distributed training.\n", + "Post-training includes supervised fine-tuning, rejection sampling, and direct preference optimization to align the model with human preferences.\n", + "Llama 3 demonstrates competitive performance on various benchmarks, including MMLU, coding tasks, and math reasoning.\n", + "The paper presents experiments on integrating vision and speech capabilities using a compositional approach.\n", + "Extensive safety measures were implemented, including pre-training data filtering, safety fine-tuning, and system-level protections.\n", + "The authors are releasing the Llama 3 language models publicly to accelerate research and development in AI.\n", + "\n" + ] + } + ], + "source": [ + "paper_1_summary = summarize_text_file(\"2407.21783.txt\")\n", + "print(paper_1_summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "BERT is a novel language representation model developed by researchers at Google AI. It stands for Bidirectional Encoder Representations from Transformers and introduces a new approach to pre-training deep bidirectional representations from unlabeled text. Unlike previous models that looked at text sequences either from left-to-right or combined left-to-right and right-to-left training, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.\n", + "The key innovation is the application of bidirectional training of Transformer, a popular attention model, to language modeling. This is achieved through two pre-training tasks: Masked Language Model (MLM) and Next Sentence Prediction (NSP). In MLM, the model attempts to predict masked words in a sentence, allowing it to incorporate context from both directions. NSP trains the model to understand relationships between sentences.\n", + "BERT significantly outperformed previous state-of-the-art models on a wide range of NLP tasks, including question answering, natural language inference, and others, without substantial task-specific architecture modifications. 
The researchers demonstrated the effectiveness of BERT by obtaining new state-of-the-art results on eleven natural language processing tasks.\n", + "Key Takeaways:\n", + "\n", + "BERT introduces deep bidirectional representations, overcoming limitations of previous unidirectional or shallowly bidirectional models.\n", + "The model uses \"masked language modeling\" (MLM) for bidirectional training of Transformer.\n", + "BERT is pre-trained on two tasks: masked language modeling and next sentence prediction.\n", + "It achieves state-of-the-art performance on 11 NLP tasks, including an improvement of 7.7% on the GLUE benchmark.\n", + "BERT's architecture allows for fine-tuning with just one additional output layer, making it versatile for various NLP tasks.\n", + "The model demonstrates that deep bidirectional language representation improves language understanding compared to left-to-right or shallow bidirectional approaches.\n", + "BERT's performance improves with larger model sizes, even on small-scale tasks.\n", + "The pre-training of BERT is computationally expensive but fine-tuning is relatively inexpensive.\n", + "BERT can be used for both fine-tuning and as a feature-based approach, with competitive results in both scenarios.\n", + "\n" + ] + } + ], + "source": [ + "paper_2_summary = summarize_text_file(\"2103.11943.txt\")\n", + "print(paper_2_summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "user_input = f\"\"\"\n", + "Here are the summaries of the two papers, look at them closely and tell me the differences of the papers: Paper 1 Summary {paper_1_summary} and Paper 2 Summary {paper_2_summary}\n", + "\"\"\"\n", + "\n", + "output = model_chat(user_input, temperature=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The two paper summaries are about different language models: Llama 3 and BERT.\n", + "\n", + "The main differences are:\n", + "\n", + "1. Model Type: Llama 3 is a set of foundation models developed by Meta AI, while BERT is a language representation model developed by researchers at Google AI.\n", + "2. Model Architecture: Llama 3 uses a dense Transformer architecture, while BERT uses a bidirectional Transformer architecture.\n", + "3. Training Process: Llama 3 involves significant infrastructure improvements to handle large-scale distributed training, while BERT uses pre-training tasks such as Masked Language Model (MLM) and Next Sentence Prediction (NSP).\n", + "4. Multimodal Capabilities: Llama 3 explores multimodal capabilities by integrating vision and speech components, while BERT focuses on text-based language understanding.\n", + "5. Performance: Both models demonstrate competitive performance on various benchmarks, but Llama 3 shows performance on tasks such as multilingual capabilities, coding, reasoning, and tool usage, while BERT excels on NLP tasks such as question answering and natural language inference.\n", + "6. Release: Llama 3 is released publicly to accelerate research and development in AI, while BERT is released as a state-of-the-art model for NLP tasks.\n", + "7. 
Model Size: Llama 3 has models with 8B, 70B, and 405B parameters, while BERT's model size is not specified in the summary.\n" + ] + } + ], + "source": [ + "print(output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Handle the function calling logic: \n", + "\n", + "Now that we have validated a MVP, we can write a simple function to handle tool-calling:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'query': 'arxiv id of Llama 3.1', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'title': 'TheLlama3HerdofModels - arXiv.org', 'url': 'https://arxiv.org/pdf/2407.21783', 'content': 'arXiv:2407.21783v2 [cs.AI] 15 Aug 2024. Finetuned Multilingual Longcontext Tooluse Release ... The model architecture of Llama 3 is illustrated in Figure1. The development of our Llama 3 language modelscomprisestwomainstages:', 'score': 0.9961004, 'raw_content': None}, {'title': '[PDF] The Llama 3 Herd of Models - Semantic Scholar', 'url': 'https://www.semanticscholar.org/paper/The-Llama-3-Herd-of-Models-Dubey-Jauhri/6520557cc3bfd198f960cc8cb6151c3474321bd8', 'content': 'DOI: 10.48550/arXiv.2407.21783 Corpus ID: 271571434; The Llama 3 Herd of Models @article{Dubey2024TheL3, title={The Llama 3 Herd of Models}, author={Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Amy Yang and Angela Fan and Anirudh Goyal and Anthony Hartshorn and Aobo Yang and Archi Mitra and ...', 'score': 0.9943581, 'raw_content': None}, {'title': 'The Llama 3 Herd of Models | Research - AI at Meta', 'url': 'https://ai.meta.com/research/publications/the-llama-3-herd-of-models/', 'content': 'This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety.', 'score': 0.9320833, 'raw_content': None}, {'title': 'Introducing Llama 3.1: Our most capable models to date - Meta AI', 'url': 'https://ai.meta.com/blog/meta-llama-3-1/', 'content': 'Bringing open intelligence to all, our latest models expand context length to 128K, add support across eight languages, and include Llama 3.1 405B—the first frontier-level open source AI model. Llama 3.1 405B is in a class of its own, with unmatched flexibility, control, and state-of-the-art capabilities that rival the best closed source models.', 'score': 0.8467045, 'raw_content': None}, {'title': '[2407.21783] The Llama 3 Herd of Models - arXiv.org', 'url': 'https://arxiv.org/abs/2407.21783', 'content': 'Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. 
This paper presents an extensive ...', 'score': 0.68257374, 'raw_content': None}], 'response_time': 1.7}, {'query': 'arxiv id of BERT', 'follow_up_questions': None, 'answer': None, 'images': [], 'results': [{'title': '[2103.11943] BERT: A Review of Applications in Natural Language ...', 'url': 'https://arxiv.org/abs/2103.11943', 'content': 'arXiv:2103.11943 (cs) [Submitted on 22 Mar 2021] BERT: A Review of Applications in Natural Language Processing and Understanding. M. V. Koroteev. In this review, we describe the application of one of the most popular deep learning-based language models - BERT. The paper describes the mechanism of operation of this model, the main areas of its ...', 'score': 0.99411184, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://aclanthology.org/N19-1423/', 'content': 'Abstract. We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning ...', 'score': 0.9222025, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://research.google/pubs/bert-pre-training-of-deep-bidirectional-transformers-for-language-understanding/', 'content': 'Abstract. We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers.', 'score': 0.87652874, 'raw_content': None}, {'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language ...', 'url': 'https://arxiv.org/abs/1810.04805', 'content': 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned ...', 'score': 0.66115755, 'raw_content': None}, {'title': 'A Primer in BERTology: What We Know About How BERT Works', 'url': 'https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00349/96482/A-Primer-in-BERTology-What-We-Know-About-How-BERT', 'content': 'The issue of model depth must be related to the information flow from the most task-specific layers closer to the classifier (Liu et al., 2019a), to the initial layers which appear to be the most task-invariant (Hao et al., 2019), and where the tokens resemble the input tokens the most (Brunner et al., 2020) For BERT, this has been achieved through experiments with loss functions (Sanh et al., 2019; Jiao et al., 2019), mimicking the activation patterns of individual portions of the teacher network (Sun et al., 2019a), and knowledge transfer at the pre-training (Turc et al., 2019; Jiao et al., 2019; Sun et al., 2020) or fine-tuning stage (Jiao et al., 2019). 
In particular, they were shown to rely on shallow heuristics in natural language inference (McCoy et al., 2019b; Zellers et al., 2019; Jin et al., 2020), reading comprehension (Si et al., 2019; Rogers et al., 2020; Sugawara et al., 2020; Yogatama et al., 2019), argument reasoning comprehension (Niven and Kao, 2019), and text classification (Jin et al., 2020). Several studies explored the possibilities of improving the fine-tuning of BERT:\\nTaking more layers into account: learning a complementary representation of the information in deep and output layers (Yang and Zhao, 2019), using a weighted combination of all layers instead of the final one (Su and Cheng, 2019; Kondratyuk and Straka, 2019), and layer dropout (Kondratyuk and Straka, 2019).\\n For BERT, Clark et al. (2019) observe that most heads in the same layer show similar self-attention patterns (perhaps related to the fact that the output of all self-attention heads in a layer is passed through the same MLP), which explains why Michel et al. (2019) were able to reduce most layers to a single head.\\n', 'score': 0.4250085, 'raw_content': None}], 'response_time': 2.2}]\n", + "This is a regular output without function call.\n" + ] + } + ], + "source": [ + "def handle_llm_output(llm_output):\n", + " # Check if the output starts with \"\"\n", + " start = input_string.find(prefix) + len(prefix)\n", + " end = input_string.find(suffix)\n", + " function_and_params = input_string[start:end]\n", + " \n", + " # Split to get function name and parameters\n", + " function_name, params_json = function_and_params.split(\">{\")\n", + " function_name = function_name.strip()\n", + " params_json = \"{\" + params_json\n", + " \n", + " # Convert parameters to dictionary\n", + " params = json.loads(params_json)\n", + " \n", + " # Call the function dynamically\n", + " function_map = {\n", + " \"query_for_two_papers\": query_for_two_papers,\n", + " \"get_arxiv_id\": get_arxiv_ids,\n", + " \"process_arxiv_paper\": process_arxiv_paper,\n", + " \"summarise_text_file\": summarize_text_file\n", + " }\n", + " \n", + " if function_name in function_map:\n", + " result = function_map[function_name](**params)\n", + " return result\n", + " else:\n", + " return \"Function not found\"\n", + "\n", + "# Testing usage\n", + "llm_outputs = [\n", + " \"{\\\"paper_1\\\": \\\"Llama 3.1\\\", \\\"paper_2\\\": \\\"BERT\\\"}\",\n", + " \"Llama 3.2 models are here too btw!\"\n", + "]\n", + "\n", + "for output in llm_outputs:\n", + " result = handle_llm_output(output)\n", + " print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#fin" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/recipes/quickstart/agents/dlai/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb similarity index 100% rename from recipes/quickstart/agents/dlai/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb rename to 
recipes/quickstart/agents/DeepLearningai_Course_Notebooks/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb diff --git a/recipes/quickstart/agents/dlai/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb similarity index 100% rename from recipes/quickstart/agents/dlai/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb rename to recipes/quickstart/agents/DeepLearningai_Course_Notebooks/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb diff --git a/recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb similarity index 97% rename from recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb rename to recipes/quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb index 433c6906c..67eda87f7 100644 --- a/recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb +++ b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { diff --git a/recipes/quickstart/agents/dlai/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb similarity index 100% rename from recipes/quickstart/agents/dlai/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb rename to recipes/quickstart/agents/DeepLearningai_Course_Notebooks/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb diff --git a/recipes/quickstart/agents/dlai/README.md b/recipes/quickstart/agents/DeepLearningai_Course_Notebooks/README.md similarity index 100% rename from recipes/quickstart/agents/dlai/README.md rename to recipes/quickstart/agents/DeepLearningai_Course_Notebooks/README.md diff --git a/recipes/quickstart/agents/README.md b/recipes/quickstart/agents/README.md new file mode 100644 index 000000000..9ae617d25 --- /dev/null +++ b/recipes/quickstart/agents/README.md @@ -0,0 +1,6 @@ +## Agents and Tool Calling + +Structure: + +- Agents_Tutorial: Showcases 101 and 201 notebooks guidance for using tool calling with Llama models +- DeepLearning_Course_Notebooks: Notebooks from the DL.ai course teaching Agents \ No newline at end of file diff --git a/recipes/quickstart/finetuning/README.md b/recipes/quickstart/finetuning/README.md index aea8cbc49..bee4db7f5 100644 --- a/recipes/quickstart/finetuning/README.md +++ b/recipes/quickstart/finetuning/README.md @@ -8,7 +8,7 @@ This folder contains instructions to fine-tune Meta Llama 3 on a using the canonical [finetuning script](../../../src/llama_recipes/finetuning.py) in the llama-recipes package. -If you are new to fine-tuning techniques, check out an overview: [](./LLM_finetuning_overview.md) +If you are new to fine-tuning techniques, check out [an overview](./LLM_finetuning_overview.md). 
> [!TIP] > If you want to try finetuning Meta Llama 3 in a Jupyter notebook you can find a quickstart notebook [here](./quickstart_peft_finetuning.ipynb) diff --git a/recipes/quickstart/finetuning/finetune_vision_model.md b/recipes/quickstart/finetuning/finetune_vision_model.md index e73e27a87..6f7d64f64 100644 --- a/recipes/quickstart/finetuning/finetune_vision_model.md +++ b/recipes/quickstart/finetuning/finetune_vision_model.md @@ -22,12 +22,14 @@ For **LoRA finetuning with FSDP**, we can run the following code: For more details about the finetuning configurations, please read the [finetuning readme](./README.md). +For more details about local inference with the fine-tuned checkpoint, please read [Inference with FSDP checkpoints section](https://github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/inference/local_inference#inference-with-fsdp-checkpoints) to learn how to convert the FSDP weights into a consolidated Hugging Face formatted model for local inference. + ### How to use a custom dataset to fine-tune vision model In order to use a custom dataset, please follow the steps below: 1. Create a new dataset python file under `recipes/quickstart/finetuning/dataset` folder. 2. In this python file, you need to define a `get_custom_dataset(dataset_config, processor, split, split_ratio=0.9)` function that handles the data loading. -3. In this python file, you need to define a `get_data_collator(processor)` class that returns a custom data collator that can be used by the Pytorch Data Loader. +3. In this python file, you need to define a `get_data_collator(processor)` function that returns a custom data collator that can be used by the Pytorch Data Loader. 4. This custom data collator class must have a `__call__(self, samples)` function that converts the image and text samples into the actual inputs that vision model expects. 5. Run the `torchrun` command from above section, please change the `--custom_dataset.file` to the new dataset python file, adjust the learning rate accordingly. diff --git a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb index e26a10bd5..df2674d53 100644 --- a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb +++ b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb @@ -8,7 +8,7 @@ "Copyright (c) Meta Platforms, Inc. and affiliates.\n", "This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.\n", "\n", - "\"Open" + "\"Open" ] }, { diff --git a/recipes/quickstart/inference/local_inference/README.md b/recipes/quickstart/inference/local_inference/README.md index 73b83b929..cb073fd35 100644 --- a/recipes/quickstart/inference/local_inference/README.md +++ b/recipes/quickstart/inference/local_inference/README.md @@ -1,11 +1,14 @@ # Local Inference +## Hugging face setup +**Important Note**: Before running the inference, you'll need your Hugging Face access token, which you can get at your Settings page [here](https://huggingface.co/settings/tokens). Then run `huggingface-cli login` and copy and paste your Hugging Face access token to complete the login to make sure the scripts can download Hugging Face models if needed. + ## Multimodal Inference -For Multi-Modal inference we have added [multi_modal_infer.py](multi_modal_infer.py) which uses the transformers library +For Multi-Modal inference we have added [multi_modal_infer.py](multi_modal_infer.py) which uses the transformers library. 
-The way to run this would be +The way to run this would be: ``` -python multi_modal_infer.py --image_path "./resources/image.jpg" --prompt_text "Describe this image" --temperature 0.5 --top_p 0.8 --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" +python multi_modal_infer.py --image_path PATH_TO_IMAGE --prompt_text "Describe this image" --temperature 0.5 --top_p 0.8 --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct" ``` --- ## Multi-modal Inferencing Using gradio UI for inferencing diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py index 8c11de8ee..27d45b5f1 100644 --- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py +++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py @@ -1,10 +1,11 @@ +import argparse import os import sys -import argparse -from PIL import Image as PIL_Image + import torch +from accelerate import Accelerator +from PIL import Image as PIL_Image from transformers import MllamaForConditionalGeneration, MllamaProcessor -from accelerate import Accelerator accelerator = Accelerator() @@ -14,15 +15,19 @@ DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct" -def load_model_and_processor(model_name: str, hf_token: str): +def load_model_and_processor(model_name: str): """ Load the model and processor based on the 11B or 90B model. """ - model = MllamaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True, device_map=device, - token=hf_token) - processor = MllamaProcessor.from_pretrained(model_name, token=hf_token,use_safetensors=True) + model = MllamaForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + use_safetensors=True, + device_map=device, + ) + processor = MllamaProcessor.from_pretrained(model_name, use_safetensors=True) - model, processor=accelerator.prepare(model, processor) + model, processor = accelerator.prepare(model, processor) return model, processor @@ -37,37 +42,67 @@ def process_image(image_path: str) -> PIL_Image.Image: return PIL_Image.open(f).convert("RGB") -def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float): +def generate_text_from_image( + model, processor, image, prompt_text: str, temperature: float, top_p: float +): """ Generate text from an image using the model and processor. """ conversation = [ - {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": prompt_text}], + } ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + prompt = processor.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) inputs = processor(image, prompt, return_tensors="pt").to(device) - output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512) - return processor.decode(output[0])[len(prompt):] + output = model.generate( + **inputs, temperature=temperature, top_p=top_p, max_new_tokens=512 + ) + return processor.decode(output[0])[len(prompt) :] -def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str, hf_token: str): +def main( + image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str +): """ - Call all the functions. + Call all the functions. 
""" - model, processor = load_model_and_processor(model_name, hf_token) + model, processor = load_model_and_processor(model_name) image = process_image(image_path) - result = generate_text_from_image(model, processor, image, prompt_text, temperature, top_p) + result = generate_text_from_image( + model, processor, image, prompt_text, temperature, top_p + ) print("Generated Text: " + result) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate text from an image and prompt using the 3.2 MM Llama model.") + parser = argparse.ArgumentParser( + description="Generate text from an image and prompt using the 3.2 MM Llama model." + ) parser.add_argument("--image_path", type=str, help="Path to the image file") - parser.add_argument("--prompt_text", type=str, help="Prompt text to describe the image") - parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for generation (default: 0.7)") - parser.add_argument("--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)") - parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help=f"Model name (default: '{DEFAULT_MODEL}')") - parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face token for authentication") + parser.add_argument( + "--prompt_text", type=str, help="Prompt text to describe the image" + ) + parser.add_argument( + "--temperature", + type=float, + default=0.7, + help="Temperature for generation (default: 0.7)", + ) + parser.add_argument( + "--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)" + ) + parser.add_argument( + "--model_name", + type=str, + default=DEFAULT_MODEL, + help=f"Model name (default: '{DEFAULT_MODEL}')", + ) args = parser.parse_args() - main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token) \ No newline at end of file + main( + args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name + ) diff --git a/recipes/use_cases/multilingual/README.md b/recipes/use_cases/multilingual/README.md index 899c73fdb..159db54b3 100644 --- a/recipes/use_cases/multilingual/README.md +++ b/recipes/use_cases/multilingual/README.md @@ -1,7 +1,7 @@ # Extending Llama to a new language Authored by : Sarvam team In this recipe, we will see how to add a new language to the Llama family of models. The steps are quite general and can be easily adapted to other models as well. Using this recipe, you should be able to replicate the findings of [OpenHathi](https://huggingface.co/sarvamai/OpenHathi-7B-Hi-v0.1-Base). -Please read more about OpenHathi [here](https://web.archive.org/web/20240418103408/https://www.sarvam.ai/blog/announcing-openhathi-series) +Please read more about OpenHathi [here](https://x.com/SarvamAI/status/1734645628288831557) ## Data The original OpenHathi model uses a combination of [Sangraha](https://huggingface.co/datasets/ai4bharat/sangraha) and Wikipedia as its primary data sources. If the reader is interested in using these sources, they would also have to preprocess the data: clean, filter, and deduplicate. See [Setu](https://github.com/AI4Bharat/setu) for an easy way to do this at scale. 
diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py index 549a53935..89e86de3b 100644 --- a/src/llama_recipes/configs/datasets.py +++ b/src/llama_recipes/configs/datasets.py @@ -9,7 +9,6 @@ class samsum_dataset: dataset: str = "samsum_dataset" train_split: str = "train" test_split: str = "validation" - trust_remote_code: bool = False @dataclass diff --git a/src/llama_recipes/datasets/samsum_dataset.py b/src/llama_recipes/datasets/samsum_dataset.py index c0f11f976..1edd701f2 100644 --- a/src/llama_recipes/datasets/samsum_dataset.py +++ b/src/llama_recipes/datasets/samsum_dataset.py @@ -6,11 +6,22 @@ import copy import datasets +from unittest.mock import patch + +@patch('builtins.input', return_value="N") +def load_samsum(split, _): + try: + ds = datasets.load_dataset("Samsung/samsum", split=split) + except ValueError as e: + if "trust_remote_code" in str(e): + raise ValueError("Loading Samsung/samsum requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set HF_DATASETS_TRUST_REMOTE_CODE env variable to True.") from e + else: + raise e + return ds + def get_preprocessed_samsum(dataset_config, tokenizer, split): - if not hasattr(dataset_config, "trust_remote_code") or not dataset_config.trust_remote_code: - raise ValueError("The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum. To activate `trust_remote_code` option use this config: --samsum_dataset.trust_remote_code=True") - dataset = datasets.load_dataset("samsum", split=split, trust_remote_code=dataset_config.trust_remote_code) + dataset = load_samsum(split) prompt = ( f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n" diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 0e140a797..548184e6a 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -1,60 +1,68 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
-from collections import Counter +import dataclasses import os +import random +from collections import Counter +from warnings import warn -import dataclasses import fire -import random +import numpy as np import torch import torch.optim as optim -from peft import get_peft_model, PeftModel -from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - ShardingStrategy -) -from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload -from torch.optim.lr_scheduler import StepLR -from transformers import ( - AutoConfig, - AutoTokenizer, - BitsAndBytesConfig, - AutoProcessor, - LlamaForCausalLM, - MllamaForConditionalGeneration, -) -from transformers.models.llama.modeling_llama import LlamaDecoderLayer -from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer +from accelerate.utils import is_xpu_available -from llama_recipes.configs import fsdp_config as FSDP_CONFIG -from llama_recipes.configs import train_config as TRAIN_CONFIG -from llama_recipes.configs import quantization_config as QUANTIZATION_CONFIG +from llama_recipes.configs import ( + fsdp_config as FSDP_CONFIG, + quantization_config as QUANTIZATION_CONFIG, + train_config as TRAIN_CONFIG, +) from llama_recipes.data.concatenator import ConcatDataset from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing from llama_recipes.utils import fsdp_auto_wrap_policy from llama_recipes.utils.config_utils import ( - update_config, - generate_peft_config, + check_fsdp_config, generate_dataset_config, + generate_peft_config, get_dataloader_kwargs, - check_fsdp_config, + update_config, +) +from llama_recipes.utils.dataset_utils import ( + get_custom_data_collator, + get_preprocessed_dataset, ) -from llama_recipes.utils.dataset_utils import get_preprocessed_dataset,get_custom_data_collator from llama_recipes.utils.fsdp_utils import hsdp_device_mesh from llama_recipes.utils.train_utils import ( - train, + clear_gpu_cache, freeze_transformer_layers, + get_policies, + print_model_size, setup, setup_environ_flags, - clear_gpu_cache, - print_model_size, - get_policies, + train, +) +from peft import get_peft_model, PeftModel +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy +from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload +from torch.optim.lr_scheduler import StepLR +from transformers import ( + AutoConfig, + AutoProcessor, + AutoTokenizer, + BitsAndBytesConfig, + LlamaForCausalLM, + MllamaForConditionalGeneration, +) +from transformers.models.llama.modeling_llama import LlamaDecoderLayer +from transformers.models.mllama.modeling_mllama import ( + MllamaCrossAttentionDecoderLayer, + MllamaSelfAttentionDecoderLayer, + MllamaVisionEncoderLayer, ) -from accelerate.utils import is_xpu_available -from warnings import warn + def setup_wandb(train_config, fsdp_config, **kwargs): try: @@ -65,6 +73,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs): "Please install it using pip install wandb" ) from llama_recipes.configs import wandb_config as WANDB_CONFIG + wandb_config = WANDB_CONFIG() update_config(wandb_config, **kwargs) init_dict = dataclasses.asdict(wandb_config) @@ -73,6 +82,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs): run.config.update(fsdp_config, allow_val_change=True) return run + def main(**kwargs): # Update the configuration for the training and sharding process train_config, fsdp_config = TRAIN_CONFIG(), FSDP_CONFIG() @@ -82,6 +92,7 @@ 
def main(**kwargs): torch.xpu.manual_seed(train_config.seed) torch.manual_seed(train_config.seed) random.seed(train_config.seed) + np.random.seed(train_config.seed) if train_config.enable_fsdp: setup() @@ -101,18 +112,23 @@ def main(**kwargs): wandb_run = None if train_config.use_wandb: - if not train_config.enable_fsdp or rank==0: + if not train_config.enable_fsdp or rank == 0: wandb_run = setup_wandb(train_config, fsdp_config, **kwargs) - - #setting quantization configs + + # setting quantization configs bnb_config = None if train_config.quantization: if type(train_config.quantization) == type(True): - warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning) + warn( + "Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", + FutureWarning, + ) train_config.quantization = "8bit" if train_config.quantization == "8bit" and train_config.enable_fsdp: - raise ValueError("8bit quantization is not supported with FSDP, please use 4bit quantization") + raise ValueError( + "8bit quantization is not supported with FSDP, please use 4bit quantization" + ) quant_config = QUANTIZATION_CONFIG() update_config(quant_config, **kwargs) @@ -124,14 +140,22 @@ def main(**kwargs): if config.model_type == "mllama": is_vision = True model = MllamaForConditionalGeneration.from_pretrained( - train_config.model_name, - quantization_config=bnb_config, - attn_implementation="sdpa" if train_config.use_fast_kernels else None, - device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None, - torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16, - ) - processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name) - processor.tokenizer.padding_side='right' + train_config.model_name, + quantization_config=bnb_config, + attn_implementation="sdpa" if train_config.use_fast_kernels else None, + device_map=( + "auto" + if train_config.quantization and not train_config.enable_fsdp + else None + ), + torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16, + ) + processor = AutoProcessor.from_pretrained( + train_config.model_name + if train_config.tokenizer_name is None + else train_config.tokenizer_name + ) + processor.tokenizer.padding_side = "right" model.supports_gradient_checkpointing = True model.language_model.supports_gradient_checkpointing = True elif config.model_type == "llama": @@ -141,32 +165,50 @@ def main(**kwargs): quantization_config=bnb_config, use_cache=use_cache, attn_implementation="sdpa" if train_config.use_fast_kernels else None, - device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None, + device_map=( + "auto" + if train_config.quantization and not train_config.enable_fsdp + else None + ), torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16, ) else: - raise ValueError(f"Model type {config.model_type} is not supported. Please use llama or mllama model.") + raise ValueError( + f"Model type {config.model_type} is not supported. Please use llama or mllama model." 
+ ) # Load the tokenizer and add special tokens - tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name) - if not tokenizer.pad_token_id: + tokenizer = AutoTokenizer.from_pretrained( + train_config.model_name + if train_config.tokenizer_name is None + else train_config.tokenizer_name + ) + if not tokenizer.pad_token_id: tokenizer.pad_token_id = tokenizer.eos_token_id - + # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + print( + "WARNING: Resizing the embedding matrix to match the tokenizer vocab size." + ) model.resize_token_embeddings(len(tokenizer)) print_model_size(model, train_config, rank if train_config.enable_fsdp else 0) # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled - if train_config.enable_fsdp and fsdp_config.pure_bf16 and not train_config.quantization: + if ( + train_config.enable_fsdp + and fsdp_config.pure_bf16 + and not train_config.quantization + ): model.to(torch.bfloat16) - + if train_config.use_peft: # Load the pre-trained peft model checkpoint and setup its configuration if train_config.from_peft_checkpoint: - model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True) + model = PeftModel.from_pretrained( + model, train_config.from_peft_checkpoint, is_trainable=True + ) peft_config = model.peft_config # Generate the peft config and start fine-tuning from original model else: @@ -177,23 +219,36 @@ def main(**kwargs): model.print_trainable_parameters() hsdp_device_mesh_plan = None - if fsdp_config.hsdp and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD: - hsdp_device_mesh_plan = hsdp_device_mesh(replica_group_size=fsdp_config.replica_group_size, sharding_group_size=fsdp_config.sharding_group_size) + if ( + fsdp_config.hsdp + and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD + ): + hsdp_device_mesh_plan = hsdp_device_mesh( + replica_group_size=fsdp_config.replica_group_size, + sharding_group_size=fsdp_config.sharding_group_size, + ) print("HSDP device mesh is ready") - #setting up FSDP if enable_fsdp is enabled + # setting up FSDP if enable_fsdp is enabled if train_config.enable_fsdp: check_fsdp_config(fsdp_config) - + if not train_config.use_peft and train_config.freeze_layers: freeze_transformer_layers(model, train_config.num_freeze_layers) mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank) # Create the FSDP wrapper for MllamaSelfAttentionDecoderLayer,MllamaSelfAttentionDecoderLayer,MllamaVisionEncoderLayer in vision models if is_vision: - my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [MllamaSelfAttentionDecoderLayer,MllamaSelfAttentionDecoderLayer,MllamaVisionEncoderLayer]) + my_auto_wrapping_policy = fsdp_auto_wrap_policy( + model, + [ + MllamaSelfAttentionDecoderLayer, + MllamaSelfAttentionDecoderLayer, + MllamaVisionEncoderLayer, + ], + ) else: - # Create the FSDP wrapper for LlamaDecoderLayer in text models + # Create the FSDP wrapper for LlamaDecoderLayer in text models my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [LlamaDecoderLayer]) device_id = 0 if is_xpu_available(): @@ -202,21 +257,36 @@ def main(**kwargs): device_id = torch.cuda.current_device() model = FSDP( model, - auto_wrap_policy= my_auto_wrapping_policy if 
train_config.use_peft else wrapping_policy, - cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None, - mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None, + auto_wrap_policy=( + my_auto_wrapping_policy if train_config.use_peft else wrapping_policy + ), + cpu_offload=( + CPUOffload(offload_params=True) + if fsdp_config.fsdp_cpu_offload + else None + ), + mixed_precision=( + mixed_precision_policy if not fsdp_config.pure_bf16 else None + ), sharding_strategy=fsdp_config.sharding_strategy, device_mesh=hsdp_device_mesh_plan, device_id=device_id, limit_all_gathers=True, sync_module_states=train_config.low_cpu_fsdp, - param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False)) - if train_config.low_cpu_fsdp and rank != 0 else None, + param_init_fn=( + ( + lambda module: module.to_empty( + device=torch.device("cuda"), recurse=False + ) + ) + if train_config.low_cpu_fsdp and rank != 0 + else None + ), ) - if fsdp_config.fsdp_activation_checkpointing: + if fsdp_config.fsdp_activation_checkpointing: model.enable_input_require_grads() model.gradient_checkpointing_enable() - apply_fsdp_checkpointing(model) + apply_fsdp_checkpointing(model) elif not train_config.quantization and not train_config.enable_fsdp: if is_xpu_available(): model.to("xpu:0") @@ -250,11 +320,15 @@ def main(**kwargs): if is_vision: raise ValueError("Packing is not supported for vision datasets") else: - dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length) + dataset_train = ConcatDataset( + dataset_train, chunk_size=train_config.context_length + ) - train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") + train_dl_kwargs = get_dataloader_kwargs( + train_config, dataset_train, dataset_processer, "train" + ) print("length of dataset_train", len(dataset_train)) - custom_data_collator = get_custom_data_collator(dataset_processer,dataset_config) + custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: print("custom_data_collator is used") train_dl_kwargs["collate_fn"] = custom_data_collator @@ -273,9 +347,13 @@ def main(**kwargs): if is_vision: raise ValueError("Packing is not supported for vision datasets") else: - dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length) + dataset_val = ConcatDataset( + dataset_val, chunk_size=train_config.context_length + ) - val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val") + val_dl_kwargs = get_dataloader_kwargs( + train_config, dataset_val, dataset_processer, "val" + ) if custom_data_collator: val_dl_kwargs["collate_fn"] = custom_data_collator @@ -287,7 +365,9 @@ def main(**kwargs): ) print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: - raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.") + raise ValueError( + f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. 
({len(eval_dataloader)=})" + ) else: print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") @@ -322,11 +402,12 @@ def main(**kwargs): rank if train_config.enable_fsdp else None, wandb_run, ) - if not train_config.enable_fsdp or rank==0: - [print(f'Key: {k}, Value: {v}') for k, v in results.items()] + if not train_config.enable_fsdp or rank == 0: + [print(f"Key: {k}, Value: {v}") for k, v in results.items()] if train_config.use_wandb: - for k,v in results.items(): + for k, v in results.items(): wandb_run.summary[k] = v + if __name__ == "__main__": fire.Fire(main) diff --git a/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py b/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py index a8c5e646f..642459edd 100644 --- a/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py +++ b/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py @@ -3,14 +3,15 @@ # from accelerate import init_empty_weights, load_checkpoint_and_dispatch -import fire import os import sys + +import fire import yaml -from transformers import AutoTokenizer +from llama_recipes.inference.model_utils import load_llama_from_config -from llama_recipes.inference.model_utils import load_llama_from_config +from transformers import AutoConfig, AutoTokenizer, MllamaProcessor # Get the current file's directory current_directory = os.path.dirname(os.path.abspath(__file__)) @@ -22,23 +23,24 @@ sys.path.append(parent_directory) from model_checkpointing import load_sharded_model_single_gpu + def main( - fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints - consolidated_model_path="", # Path to save the HF converted model checkpoints - HF_model_path_or_name="" # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf) - ): - + fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints + consolidated_model_path="", # Path to save the HF converted model checkpoints + HF_model_path_or_name="", # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. 
meta-llama/Llama-2-7b-chat-hf) +): + try: - file_name = 'train_params.yaml' + file_name = "train_params.yaml" # Combine the directory and file name to create the full path train_params_path = os.path.join(fsdp_checkpoint_path, file_name) # Open the file - with open(train_params_path, 'r') as file: + with open(train_params_path, "r") as file: # Load the YAML data data = yaml.safe_load(file) # Access the 'model_name' field - HF_model_path_or_name = data.get('model_name') + HF_model_path_or_name = data.get("model_name") print(f"Model name: {HF_model_path_or_name}") except FileNotFoundError: @@ -47,19 +49,33 @@ def main( print(f"Model name: {HF_model_path_or_name}") except Exception as e: print(f"An error occurred: {e}") - - - #load the HF model definition from config + + # load the HF model definition from config model_def = load_llama_from_config(HF_model_path_or_name) print("model is loaded from config") - #load the FSDP sharded checkpoints into the model + # load the FSDP sharded checkpoints into the model model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path) print("model is loaded from FSDP checkpoints") - #loading the tokenizer form the model_path - tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name) - tokenizer.save_pretrained(consolidated_model_path) - #save the FSDP sharded checkpoints in HF format + # loading the tokenizer form the model_path + config = AutoConfig.from_pretrained(HF_model_path_or_name) + # save the processor and config for mllama models + if config.model_type == "mllama": + processor = MllamaProcessor.from_pretrained(HF_model_path_or_name) + processor.save_pretrained(consolidated_model_path) + print( + f"HuggingFace mllama processor has been saved in {consolidated_model_path}" + ) + else: + # save the tokenizer for llama models + tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name) + tokenizer.save_pretrained(consolidated_model_path) + print( + f"HuggingFace llama tokenizer has been saved in {consolidated_model_path}" + ) + # save the FSDP sharded checkpoints in HF format model.save_pretrained(consolidated_model_path) print(f"HuggingFace model checkpoints has been saved in {consolidated_model_path}") + + if __name__ == "__main__": fire.Fire(main) diff --git a/src/llama_recipes/inference/model_utils.py b/src/llama_recipes/inference/model_utils.py index 2b150eea3..99f191005 100644 --- a/src/llama_recipes/inference/model_utils.py +++ b/src/llama_recipes/inference/model_utils.py @@ -1,17 +1,29 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the GNU General Public License version 3. +from warnings import warn + +from llama_recipes.configs import quantization_config as QUANT_CONFIG from llama_recipes.utils.config_utils import update_config -from llama_recipes.configs import quantization_config as QUANT_CONFIG from peft import PeftModel -from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig -from warnings import warn +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + LlamaConfig, + LlamaForCausalLM, + MllamaConfig, + MllamaForConditionalGeneration, +) + # Function to load the main model for text generation def load_model(model_name, quantization, use_fast_kernels, **kwargs): if type(quantization) == type(True): - warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. 
Defaulting to '8bit' but this might change in the future.", FutureWarning) - quantization = "8bit" + warn( + "Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", + FutureWarning, + ) + quantization = "8bit" bnb_config = None if quantization: @@ -23,10 +35,10 @@ def load_model(model_name, quantization, use_fast_kernels, **kwargs): kwargs = {} if bnb_config: - kwargs["quantization_config"]=bnb_config - kwargs["device_map"]="auto" - kwargs["low_cpu_mem_usage"]=True - kwargs["attn_implementation"]="sdpa" if use_fast_kernels else None + kwargs["quantization_config"] = bnb_config + kwargs["device_map"] = "auto" + kwargs["low_cpu_mem_usage"] = True + kwargs["attn_implementation"] = "sdpa" if use_fast_kernels else None model = AutoModelForCausalLM.from_pretrained( model_name, return_dict=True, @@ -40,10 +52,16 @@ def load_peft_model(model, peft_model): peft_model = PeftModel.from_pretrained(model, peft_model) return peft_model + # Loading the model from config to load FSDP checkpoints into that def load_llama_from_config(config_path): - model_config = LlamaConfig.from_pretrained(config_path) - model = LlamaForCausalLM(config=model_config) + config = AutoConfig.from_pretrained(config_path) + if config.model_type == "mllama": + model = MllamaForConditionalGeneration(config=config) + elif config.model_type == "llama": + model = LlamaForCausalLM(config=config) + else: + raise ValueError( + f"Unsupported model type: {config.model_type}, Please use llama or mllama model." + ) return model - - \ No newline at end of file diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index 9ce2eb7b8..d3b42ae12 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -151,11 +151,11 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche batch[key] = batch[key].to('cuda:0') with autocast(): loss = model(**batch).loss + total_loss += loss.detach().float() loss = loss / gradient_accumulation_steps if train_config.save_metrics: train_step_loss.append(loss.detach().float().item()) train_step_perplexity.append(float(torch.exp(loss.detach().float()))) - total_loss += loss.detach().float() if train_config.use_fp16: # if fp16 is enabled, use gradient scaler to handle gradient update scaler.scale(loss).backward() @@ -288,7 +288,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche print(f"best eval loss on epoch {epoch+1} is {best_val_loss}") else: print(f"best eval loss on epoch {epoch+1} is {best_val_loss}") - val_loss.append(float(best_val_loss)) + val_loss.append(float(eval_epoch_loss)) val_prep.append(float(eval_ppl)) if train_config.enable_fsdp: if rank==0: diff --git a/src/tests/conftest.py b/src/tests/conftest.py index 710ed7404..1476bf3c1 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -3,19 +3,27 @@ import pytest -from transformers import AutoTokenizer +from utils import maybe_tokenizer -ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?" -LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct"] +ACCESS_ERROR_MSG = "Could not access tokenizer. Did you log into huggingface hub and provided the correct token?" 
+ +LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct", "fake_llama"] + +LLAMA_TOKENIZERS = {k: maybe_tokenizer(k) for k in LLAMA_VERSIONS} @pytest.fixture(params=LLAMA_VERSIONS) def llama_version(request): return request.param +@pytest.fixture(params=["mllama", "llama"]) +def model_type(request): + return request.param + + @pytest.fixture(scope="module") def llama_tokenizer(request): - return {k: AutoTokenizer.from_pretrained(k) for k in LLAMA_VERSIONS} + return LLAMA_TOKENIZERS @pytest.fixture @@ -26,6 +34,13 @@ def _helper(tokenizer_mock): return _helper +@pytest.fixture +def setup_processor(llama_tokenizer, llama_version): + def _helper(processor_mock): + processor_mock.from_pretrained.return_value.tokenizer = llama_tokenizer[llama_version] + + return _helper + def pytest_addoption(parser): parser.addoption( @@ -38,16 +53,18 @@ def pytest_configure(config): def pytest_collection_modifyitems(config, items): + #skip tests marked with skip_missing_tokenizer if tokenizer is unavailable unless --unskip-missing-tokenizer is passed if config.getoption("--unskip-missing-tokenizer"): return - try: - AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - tokenizer_available = True - except OSError: - tokenizer_available = False - skip_missing_tokenizer = pytest.mark.skip(reason=ACCESS_ERROR_MSG) for item in items: - if "skip_missing_tokenizer" in item.keywords and not tokenizer_available: + # get the tokenizer for the test + version = [v for v in LLAMA_VERSIONS for i in item.keywords if v in i] + if len(version) == 0: + # no tokenizer used in this test + continue + version = version.pop() + assert version in LLAMA_TOKENIZERS + if "skip_missing_tokenizer" in item.keywords and LLAMA_TOKENIZERS[version] is None: item.add_marker(skip_missing_tokenizer) diff --git a/src/tests/datasets/test_custom_dataset.py b/src/tests/datasets/test_custom_dataset.py index 7cf8abe3e..f842733b7 100644 --- a/src/tests/datasets/test_custom_dataset.py +++ b/src/tests/datasets/test_custom_dataset.py @@ -2,6 +2,7 @@ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
import pytest +from contextlib import nullcontext from unittest.mock import patch from transformers import LlamaTokenizer @@ -96,15 +97,17 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker, @patch('llama_recipes.finetuning.train') +@patch('llama_recipes.finetuning.AutoConfig.from_pretrained') @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained') @patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained') @patch('llama_recipes.finetuning.optim.AdamW') @patch('llama_recipes.finetuning.StepLR') -def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker, llama_version): +def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, get_config, train, mocker, llama_version): from llama_recipes.finetuning import main tokenizer.return_value = mocker.MagicMock(side_effect=lambda x: {"input_ids":[len(x)*[0,]], "attention_mask": [len(x)*[0,]]}) get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + get_config.return_value.model_type = "llama" kwargs = { "dataset": "custom_dataset", @@ -131,13 +134,16 @@ def test_tokenize_dialog(tokenizer, monkeypatch, setup_tokenizer, llama_version) {"role":"assistant", "content":"Romans"}, ] - result = tokenize_dialog(dialog, tokenizer) + c = pytest.raises(AttributeError) if llama_version == "fake_llama" else nullcontext() + + with c: + result = tokenize_dialog(dialog, tokenizer) if "Llama-2" in llama_version: assert result["labels"][:12] == [-100] * 12 assert result["labels"][17:28] == [-100] * 11 assert result["labels"].count(-100) == 11 + 12 - else: + elif "Llama-3" in llama_version: assert result["labels"][:38] == [-100] * 38 assert result["labels"][43:54] == [-100] * 11 assert result["labels"].count(-100) == 38 + 11 diff --git a/src/tests/datasets/test_grammar_datasets.py b/src/tests/datasets/test_grammar_datasets.py index e05e51ca9..f61d14988 100644 --- a/src/tests/datasets/test_grammar_datasets.py +++ b/src/tests/datasets/test_grammar_datasets.py @@ -1,32 +1,27 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
+from pathlib import Path import pytest from unittest.mock import patch - -EXPECTED_RESULTS = { - "meta-llama/Llama-2-7b-hf":{ - "label": 1152, - "pos": 31, - }, - "meta-llama/Meta-Llama-3.1-8B":{ - "label": 40, - "pos": 26, - }, -} +DATA_DIR = Path(__file__).parents[2] / "llama_recipes/datasets/grammar_dataset/" @pytest.mark.skip_missing_tokenizer +@pytest.mark.skipif(not Path(DATA_DIR / "grammar_validation.csv").exists(), reason="grammar_validation.csv not found") +@pytest.mark.skipif(not Path(DATA_DIR / "gtrain_10k.csv").exists(), reason="gtrain_10k.csv not found") @patch('llama_recipes.finetuning.train') @patch('llama_recipes.finetuning.AutoTokenizer') +@patch('llama_recipes.finetuning.AutoConfig.from_pretrained') @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained') @patch('llama_recipes.finetuning.optim.AdamW') @patch('llama_recipes.finetuning.StepLR') -def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, setup_tokenizer, llama_version): +def test_grammar_dataset(step_lr, optimizer, get_model, get_config, tokenizer, train, setup_tokenizer, llama_version): from llama_recipes.finetuning import main setup_tokenizer(tokenizer) get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + get_config.return_value.model_type = "llama" BATCH_SIZE = 8 kwargs = { @@ -58,9 +53,6 @@ def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, setup_ assert "input_ids" in batch.keys() assert "attention_mask" in batch.keys() - assert batch["labels"][0][EXPECTED_RESULTS[llama_version]["pos"]-1] == -100 - assert batch["labels"][0][EXPECTED_RESULTS[llama_version]["pos"]] == EXPECTED_RESULTS[llama_version]["label"] - token = args[3] assert batch["input_ids"][0][0] == token.bos_token_id assert batch["labels"][0][-1] == token.eos_token_id diff --git a/src/tests/datasets/test_samsum_datasets.py b/src/tests/datasets/test_samsum_datasets.py index 4b6668b25..3a71059da 100644 --- a/src/tests/datasets/test_samsum_datasets.py +++ b/src/tests/datasets/test_samsum_datasets.py @@ -2,31 +2,50 @@ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
import pytest +from dataclasses import dataclass from functools import partial from unittest.mock import patch +from datasets import load_dataset -EXPECTED_RESULTS = { - "meta-llama/Llama-2-7b-hf":{ - "label": 8432, - "pos": 242, - }, - "meta-llama/Meta-Llama-3.1-8B":{ - "label": 2250, - "pos": 211, - }, -} +@dataclass +class Config: + model_type: str = "llama" +try: + load_dataset("Samsung/samsum") + SAMSUM_UNAVAILABLE = False +except ValueError: + SAMSUM_UNAVAILABLE = True + +@pytest.mark.skipif(SAMSUM_UNAVAILABLE, reason="Samsum dataset is unavailable") @pytest.mark.skip_missing_tokenizer @patch('llama_recipes.finetuning.train') @patch('llama_recipes.finetuning.AutoTokenizer') +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained') @patch('llama_recipes.finetuning.optim.AdamW') @patch('llama_recipes.finetuning.StepLR') -def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker, setup_tokenizer, llama_version): +def test_samsum_dataset( + step_lr, + optimizer, + get_model, + get_mmodel, + processor, + get_config, + tokenizer, + train, + mocker, + setup_tokenizer, + llama_version, + ): from llama_recipes.finetuning import main setup_tokenizer(tokenizer) get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_config.return_value = Config() BATCH_SIZE = 8 kwargs = { @@ -59,9 +78,6 @@ def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker, assert "input_ids" in batch.keys() assert "attention_mask" in batch.keys() - assert batch["labels"][0][EXPECTED_RESULTS[llama_version]["pos"]-1] == -100 - assert batch["labels"][0][EXPECTED_RESULTS[llama_version]["pos"]] == EXPECTED_RESULTS[llama_version]["label"] - assert batch["input_ids"][0][0] == token.bos_token_id assert batch["labels"][0][-1] == token.eos_token_id assert batch["input_ids"][0][-1] == token.eos_token_id diff --git a/src/tests/test_batching.py b/src/tests/test_batching.py index c450c18ac..5aed0a4c4 100644 --- a/src/tests/test_batching.py +++ b/src/tests/test_batching.py @@ -2,30 +2,68 @@ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
import pytest +from contextlib import nullcontext +from dataclasses import dataclass +from datasets import Dataset from unittest.mock import patch +@dataclass +class Config: + model_type: str = "llama" + EXPECTED_SAMPLE_NUMBER ={ "meta-llama/Llama-2-7b-hf": { - "train": 96, - "eval": 42, + "train": 4, + "eval": 37, + }, + "meta-llama/Meta-Llama-3.1-8B-Instruct": { + "train": 3, + "eval": 30, }, - "meta-llama/Meta-Llama-3.1-8B": { - "train": 79, - "eval": 34, + "fake_llama": { + "train": 2, + "eval": 17, } } +fake_samsum_dataset = 2048*[{'id': '420', + 'dialogue': "Mario: It's a me, Mario!\nLuigi: It's a me, your brother!\nMario: I'm going to save the princess.\nLuigi: I'm going to help Mario.", + 'summary': 'Mario and Luigi are going to save the princess.'}] + @pytest.mark.skip_missing_tokenizer @patch('llama_recipes.finetuning.train') @patch('llama_recipes.finetuning.AutoTokenizer') +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained') @patch('llama_recipes.finetuning.optim.AdamW') @patch('llama_recipes.finetuning.StepLR') -def test_packing(step_lr, optimizer, get_model, tokenizer, train, setup_tokenizer, llama_version): +@patch('llama_recipes.datasets.samsum_dataset.datasets') +def test_packing( + datasets, + step_lr, + optimizer, + get_model, + get_mmodel, + processor, + get_config, + tokenizer, + train, + setup_tokenizer, + setup_processor, + llama_version, + model_type, + ): from llama_recipes.finetuning import main setup_tokenizer(tokenizer) + setup_processor(processor) get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_config.return_value = Config(model_type=model_type) + + datasets.load_dataset.return_value = Dataset.from_list(fake_samsum_dataset) kwargs = { "model_name": llama_version, @@ -36,31 +74,40 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, setup_tokenize "batching_strategy": "packing", } - main(**kwargs) + c = nullcontext() if model_type == "llama" else pytest.raises(ValueError) - assert train.call_count == 1 + with c: + main(**kwargs) + + if model_type == "llama": + assert train.call_count == 1 - args, kwargs = train.call_args - train_dataloader = args[1] - eval_dataloader = args[2] + args, kwargs = train.call_args + train_dataloader = args[1] + eval_dataloader = args[2] - assert len(train_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["train"] - assert len(eval_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["eval"] + assert len(train_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["train"] + assert len(eval_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["eval"] - batch = next(iter(train_dataloader)) + batch = next(iter(train_dataloader)) - assert "labels" in batch.keys() - assert "input_ids" in batch.keys() - assert "attention_mask" in batch.keys() + assert "labels" in batch.keys() + assert "input_ids" in batch.keys() + assert "attention_mask" in batch.keys() - assert batch["labels"][0].size(0) == 4096 - assert batch["input_ids"][0].size(0) == 4096 - assert batch["attention_mask"][0].size(0) == 4096 + assert batch["labels"][0].size(0) == 4096 + assert batch["input_ids"][0].size(0) == 4096 + assert batch["attention_mask"][0].size(0) == 4096 
@pytest.mark.skip_missing_tokenizer +@patch("llama_recipes.utils.train_utils.torch.cuda.is_bf16_supported") +@patch("llama_recipes.finetuning.torch.cuda.is_available") @patch('llama_recipes.finetuning.train') @patch('llama_recipes.finetuning.AutoTokenizer') +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained') @patch('llama_recipes.finetuning.optim.AdamW') @patch('llama_recipes.finetuning.StepLR') @@ -68,12 +115,40 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, setup_tokenize @patch('llama_recipes.finetuning.FSDP') @patch('llama_recipes.finetuning.torch.distributed.is_initialized') @patch('llama_recipes.utils.config_utils.dist') -def test_distributed_packing(dist, is_initialized, fsdp, setup, step_lr, optimizer, get_model, tokenizer, train, setup_tokenizer, llama_version): +@patch('llama_recipes.datasets.samsum_dataset.datasets') +def test_distributed_packing( + datasets, + dist, + is_initialized, + fsdp, + setup, + step_lr, + optimizer, + get_model, + get_mmodel, + processor, + get_config, + tokenizer, + train, + cuda_is_available, + cuda_is_bf16_supported, + setup_tokenizer, + setup_processor, + llama_version, + model_type, + ): import os from llama_recipes.finetuning import main setup_tokenizer(tokenizer) + setup_processor(processor) get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_config.return_value = Config(model_type=model_type) + cuda_is_available.return_value = False + cuda_is_bf16_supported.return_value = False + + datasets.load_dataset.return_value = Dataset.from_list(fake_samsum_dataset) rank = 1 os.environ['LOCAL_RANK'] = f'{rank}' @@ -96,13 +171,17 @@ def test_distributed_packing(dist, is_initialized, fsdp, setup, step_lr, optimiz dist.get_rank.return_value = rank dist.get_world_size.return_value = 2 - main(**kwargs) + c = nullcontext() if model_type == "llama" else pytest.raises(ValueError) + + with c: + main(**kwargs) - assert train.call_count == 1 + if model_type == "llama": + assert train.call_count == 1 - args, kwargs = train.call_args - train_dataloader = args[1] - eval_dataloader = args[2] + args, kwargs = train.call_args + train_dataloader = args[1] + eval_dataloader = args[2] - assert len(train_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["train"] //2 - assert len(eval_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["eval"] //2 + assert len(train_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["train"] //2 + assert len(eval_dataloader) == EXPECTED_SAMPLE_NUMBER[llama_version]["eval"] //2 diff --git a/src/tests/test_chat_completion.py b/src/tests/test_chat_completion.py index fb3efc0cb..266252317 100644 --- a/src/tests/test_chat_completion.py +++ b/src/tests/test_chat_completion.py @@ -1,6 +1,6 @@ import sys from pathlib import Path -from typing import List, Literal, TypedDict +from typing import List, TypedDict from unittest.mock import patch import pytest @@ -8,46 +8,37 @@ from llama_recipes.inference.chat_utils import read_dialogs_from_file ROOT_DIR = Path(__file__).parents[2] -CHAT_COMPLETION_DIR = ROOT_DIR / "recipes/inference/local_inference/chat_completion/" +CHAT_COMPLETION_DIR = ROOT_DIR / "recipes/quickstart/inference/local_inference/chat_completion/" 
sys.path = [CHAT_COMPLETION_DIR.as_posix()] + sys.path -Role = Literal["user", "assistant"] - - -class Message(TypedDict): - role: Role - content: str - - -Dialog = List[Message] - -B_INST, E_INST = "[INST]", "[/INST]" -B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n" - +default_system_prompt = [{"role": "system", "content": "Cutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n"}] def _encode_header(message, tokenizer): tokens = [] - tokens.extend(tokenizer.encode("<|start_header_id|>")) - tokens.extend(tokenizer.encode(message["role"])) - tokens.extend(tokenizer.encode("<|end_header_id|>")) - tokens.extend(tokenizer.encode("\n\n")) + tokens.extend(tokenizer.encode("<|start_header_id|>", add_special_tokens=False)) + tokens.extend(tokenizer.encode(message["role"], add_special_tokens=False)) + tokens.extend(tokenizer.encode("<|end_header_id|>", add_special_tokens=False)) + tokens.extend(tokenizer.encode("\n\n", add_special_tokens=False)) return tokens def _encode_message(message, tokenizer): tokens = _encode_header(message, tokenizer) - tokens.extend(tokenizer.encode(message["content"].strip())) - tokens.extend(tokenizer.encode("<|eot_id|>")) + tokens.extend(tokenizer.encode(message["content"], add_special_tokens=False)) + tokens.extend(tokenizer.encode("<|eot_id|>", add_special_tokens=False)) return tokens def _format_dialog(dialog, tokenizer): tokens = [] - tokens.extend(tokenizer.encode("<|begin_of_text|>")) + tokens.extend(tokenizer.encode("<|begin_of_text|>", add_special_tokens=False)) + if dialog[0]["role"] == "system": + dialog[0]["content"] = default_system_prompt[0]["content"] + dialog[0]["content"] + else: + dialog = default_system_prompt + dialog for msg in dialog: tokens.extend(_encode_message(msg, tokenizer)) - tokens.extend(_encode_header({"role": "assistant", "content": ""}, tokenizer)) return tokens @@ -55,59 +46,19 @@ def _format_tokens_llama3(dialogs, tokenizer): return [_format_dialog(dialog, tokenizer) for dialog in dialogs] -def _format_tokens_llama2(dialogs, tokenizer): - prompt_tokens = [] - for dialog in dialogs: - if dialog[0]["role"] == "system": - dialog = [ - { - "role": dialog[1]["role"], - "content": B_SYS - + dialog[0]["content"] - + E_SYS - + dialog[1]["content"], - } - ] + dialog[2:] - assert all([msg["role"] == "user" for msg in dialog[::2]]) and all( - [msg["role"] == "assistant" for msg in dialog[1::2]] - ), ( - "model only supports 'system','user' and 'assistant' roles, " - "starting with user and alternating (u/a/u/a/u...)" - ) - """ - Please verify that your tokenizer support adding "[INST]", "[/INST]" to your inputs. - Here, we are adding it manually.
- """ - dialog_tokens: List[int] = sum( - [ - tokenizer.encode( - f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ", - ) - + [tokenizer.eos_token_id] - for prompt, answer in zip(dialog[::2], dialog[1::2]) - ], - [], - ) - assert ( - dialog[-1]["role"] == "user" - ), f"Last message must be from user, got {dialog[-1]['role']}" - dialog_tokens += tokenizer.encode( - f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}", - ) - prompt_tokens.append(dialog_tokens) - return prompt_tokens - - @pytest.mark.skip_missing_tokenizer @patch("chat_completion.AutoTokenizer") @patch("chat_completion.load_model") def test_chat_completion( load_model, tokenizer, setup_tokenizer, llama_tokenizer, llama_version ): + if "Llama-2" in llama_version or llama_version == "fake_llama": + pytest.skip(f"skipping test for {llama_version}") + from chat_completion import main setup_tokenizer(tokenizer) - load_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256] + load_model.return_value.get_input_embeddings.return_value.weight.shape = [128256] kwargs = { "prompt_file": (CHAT_COMPLETION_DIR / "chats.json").as_posix(), @@ -116,13 +67,8 @@ def test_chat_completion( main(llama_version, **kwargs) dialogs = read_dialogs_from_file(kwargs["prompt_file"]) - format_tokens = ( - _format_tokens_llama2 - if llama_version == "meta-llama/Llama-2-7b-hf" - else _format_tokens_llama3 - ) - REF_RESULT = format_tokens(dialogs, llama_tokenizer[llama_version]) + REF_RESULT = _format_tokens_llama3(dialogs, llama_tokenizer[llama_version]) assert all( ( diff --git a/src/tests/test_finetuning.py b/src/tests/test_finetuning.py index 749f8614f..d90859e0f 100644 --- a/src/tests/test_finetuning.py +++ b/src/tests/test_finetuning.py @@ -2,6 +2,8 @@ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
import os +from contextlib import nullcontext +from dataclasses import dataclass from unittest.mock import patch import pytest @@ -16,8 +18,12 @@ from torch.utils.data.sampler import BatchSampler +@dataclass +class Config: + model_type: str = "llama" + def get_fake_dataset(): - return [ + return 8192*[ { "input_ids": [1], "attention_mask": [1], @@ -28,28 +34,49 @@ def get_fake_dataset(): @patch("llama_recipes.finetuning.torch.cuda.is_available") @patch("llama_recipes.finetuning.train") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor.from_pretrained") @patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") @patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") @patch("llama_recipes.finetuning.get_preprocessed_dataset") +@patch("llama_recipes.finetuning.generate_peft_config") +@patch("llama_recipes.finetuning.get_peft_model") @patch("llama_recipes.finetuning.optim.AdamW") @patch("llama_recipes.finetuning.StepLR") @pytest.mark.parametrize("cuda_is_available", [True, False]) -def test_finetuning_no_validation( +@pytest.mark.parametrize("run_validation", [True, False]) +@pytest.mark.parametrize("use_peft", [True, False]) +def test_finetuning( step_lr, optimizer, + get_peft_model, + gen_peft_config, get_dataset, tokenizer, + get_config, get_model, + get_processor, + get_mmodel, train, cuda, cuda_is_available, + run_validation, + use_peft, + model_type, ): - kwargs = {"run_validation": False} + kwargs = { + "run_validation": run_validation, + "use_peft": use_peft, + "batching_strategy": "packing" if model_type == "llama" else "padding", + } get_dataset.return_value = get_fake_dataset() cuda.return_value = cuda_is_available get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_config.return_value = Config(model_type=model_type) main(**kwargs) @@ -60,115 +87,59 @@ def test_finetuning_no_validation( eval_dataloader = args[2] assert isinstance(train_dataloader, DataLoader) - assert eval_dataloader is None - - if cuda_is_available: - assert get_model.return_value.to.call_count == 1 - assert get_model.return_value.to.call_args.args[0] == "cuda" + if run_validation: + assert isinstance(eval_dataloader, DataLoader) else: - assert get_model.return_value.to.call_count == 0 - - -@patch("llama_recipes.finetuning.torch.cuda.is_available") -@patch("llama_recipes.finetuning.train") -@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") -@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") -@patch("llama_recipes.finetuning.get_preprocessed_dataset") -@patch("llama_recipes.finetuning.optim.AdamW") -@patch("llama_recipes.finetuning.StepLR") -@pytest.mark.parametrize("cuda_is_available", [True, False]) -def test_finetuning_with_validation( - step_lr, - optimizer, - get_dataset, - tokenizer, - get_model, - train, - cuda, - cuda_is_available, -): - kwargs = {"run_validation": True} - - get_dataset.return_value = get_fake_dataset() - cuda.return_value = cuda_is_available - - get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] - - main(**kwargs) - - assert train.call_count == 1 - - args, kwargs = train.call_args - train_dataloader = args[1] - eval_dataloader = args[2] - assert isinstance(train_dataloader, DataLoader) - assert isinstance(eval_dataloader, DataLoader) + assert eval_dataloader is None - 
if cuda_is_available: - assert get_model.return_value.to.call_count == 1 - assert get_model.return_value.to.call_args.args[0] == "cuda" + if use_peft: + assert get_peft_model.return_value.print_trainable_parameters.call_count == 1 + model = get_peft_model + elif model_type == "llama": + model = get_model else: - assert get_model.return_value.to.call_count == 0 - - -@patch("llama_recipes.finetuning.torch.cuda.is_available") -@patch("llama_recipes.finetuning.train") -@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") -@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") -@patch("llama_recipes.finetuning.get_preprocessed_dataset") -@patch("llama_recipes.finetuning.generate_peft_config") -@patch("llama_recipes.finetuning.get_peft_model") -@patch("llama_recipes.finetuning.optim.AdamW") -@patch("llama_recipes.finetuning.StepLR") -@pytest.mark.parametrize("cuda_is_available", [True, False]) -def test_finetuning_peft_lora( - step_lr, - optimizer, - get_peft_model, - gen_peft_config, - get_dataset, - tokenizer, - get_model, - train, - cuda, - cuda_is_available, -): - kwargs = {"use_peft": True} - - get_dataset.return_value = get_fake_dataset() - cuda.return_value = cuda_is_available - - get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] - - main(**kwargs) + model = get_mmodel if cuda_is_available: - assert get_peft_model.return_value.to.call_count == 1 - assert get_peft_model.return_value.to.call_args.args[0] == "cuda" + assert model.return_value.to.call_count == 1 + assert model.return_value.to.call_args.args[0] == "cuda" else: - assert get_peft_model.return_value.to.call_count == 0 - - assert get_peft_model.return_value.print_trainable_parameters.call_count == 1 + assert model.return_value.to.call_count == 0 @patch("llama_recipes.finetuning.get_peft_model") @patch("llama_recipes.finetuning.setup") @patch("llama_recipes.finetuning.train") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor.from_pretrained") @patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") @patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") @patch("llama_recipes.finetuning.get_preprocessed_dataset") def test_finetuning_peft_llama_adapter( - get_dataset, tokenizer, get_model, train, setup, get_peft_model + get_dataset, + tokenizer, + get_config, + get_model, + get_processor, + get_mmodel, + train, + setup, + get_peft_model, + model_type, ): kwargs = { "use_peft": True, "peft_method": "llama_adapter", "enable_fsdp": True, + "batching_strategy": "packing" if model_type == "llama" else "padding", } get_dataset.return_value = get_fake_dataset() get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0] + get_config.return_value = Config(model_type=model_type) os.environ["RANK"] = "0" os.environ["LOCAL_RANK"] = "0" @@ -195,20 +166,38 @@ def test_finetuning_peft_llama_adapter( @patch("llama_recipes.finetuning.train") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor.from_pretrained") @patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") @patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") @patch("llama_recipes.finetuning.get_preprocessed_dataset") 
@patch("llama_recipes.finetuning.get_peft_model") @patch("llama_recipes.finetuning.StepLR") def test_finetuning_weight_decay( - step_lr, get_peft_model, get_dataset, tokenizer, get_model, train + step_lr, + get_peft_model, + get_dataset, + tokenizer, + get_config, + get_model, + get_processor, + get_mmodel, + train, + model_type, ): - kwargs = {"weight_decay": 0.01} + kwargs = { + "weight_decay": 0.01, + "batching_strategy": "packing" if model_type == "llama" else "padding", + } get_dataset.return_value = get_fake_dataset() - get_model.return_value.parameters.return_value = [torch.ones(1, 1)] - get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] + model = get_model if model_type == "llama" else get_mmodel + model.return_value.parameters.return_value = [torch.ones(1, 1)] + model.return_value.get_input_embeddings.return_value.weight.shape = [0] + + get_config.return_value = Config(model_type=model_type) main(**kwargs) @@ -217,35 +206,54 @@ def test_finetuning_weight_decay( args, kwargs = train.call_args optimizer = args[4] - print(optimizer.state_dict()) - assert isinstance(optimizer, AdamW) assert optimizer.state_dict()["param_groups"][0]["weight_decay"] == approx(0.01) @patch("llama_recipes.finetuning.train") +@patch("llama_recipes.finetuning.MllamaForConditionalGeneration.from_pretrained") +@patch("llama_recipes.finetuning.AutoProcessor.from_pretrained") @patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained") +@patch("llama_recipes.finetuning.AutoConfig.from_pretrained") @patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained") @patch("llama_recipes.finetuning.get_preprocessed_dataset") @patch("llama_recipes.finetuning.optim.AdamW") @patch("llama_recipes.finetuning.StepLR") def test_batching_strategy( - step_lr, optimizer, get_dataset, tokenizer, get_model, train + step_lr, + optimizer, + get_dataset, + tokenizer, + get_config, + get_model, + get_processor, + get_mmodel, + train, + model_type, ): - kwargs = {"batching_strategy": "packing"} + kwargs = { + "batching_strategy": "packing", + } get_dataset.return_value = get_fake_dataset() - get_model.return_value.get_input_embeddings.return_value.weight.shape = [0] + model = get_model if model_type == "llama" else get_mmodel + model.return_value.get_input_embeddings.return_value.weight.shape = [0] - main(**kwargs) + get_config.return_value = Config(model_type=model_type) - assert train.call_count == 1 + c = nullcontext() if model_type == "llama" else pytest.raises(ValueError) + + with c: + main(**kwargs) - args, kwargs = train.call_args - train_dataloader, eval_dataloader = args[1:3] - assert isinstance(train_dataloader.batch_sampler, BatchSampler) - assert isinstance(eval_dataloader.batch_sampler, BatchSampler) + assert train.call_count == (1 if model_type == "llama" else 0) + + if model_type == "llama": + args, kwargs = train.call_args + train_dataloader, eval_dataloader = args[1:3] + assert isinstance(train_dataloader.batch_sampler, BatchSampler) + assert isinstance(eval_dataloader.batch_sampler, BatchSampler) kwargs["batching_strategy"] = "padding" train.reset_mock() diff --git a/src/tests/test_train_utils.py b/src/tests/test_train_utils.py index ca92c21ed..66e3e9f07 100644 --- a/src/tests/test_train_utils.py +++ b/src/tests/test_train_utils.py @@ -27,10 +27,16 @@ def temp_output_dir(): @patch("llama_recipes.utils.train_utils.nullcontext") @patch("llama_recipes.utils.train_utils.torch.cuda.amp.GradScaler") @patch("llama_recipes.utils.train_utils.torch.cuda.amp.autocast") -def 
test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker): +def test_gradient_accumulation( + autocast, + scaler, + nullcontext, + mem_trace, + mocker): model = mocker.MagicMock(name="model") model().loss.__truediv__().detach.return_value = torch.tensor(1) + model().loss.detach.return_value = torch.tensor(1) mock_tensor = mocker.MagicMock(name="tensor") batch = {"input": mock_tensor} train_dataloader = [batch, batch, batch, batch, batch] @@ -47,6 +53,9 @@ def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker) train_config.max_train_step = 0 train_config.max_eval_step = 0 train_config.save_metrics = False + train_config.flop_counter_start = 0 + train_config.use_profiler = False + train_config.flop_counter = True train( model, @@ -86,6 +95,7 @@ def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker) def test_save_to_json(temp_output_dir, mocker): model = mocker.MagicMock(name="model") model().loss.__truediv__().detach.return_value = torch.tensor(1) + model().loss.detach.return_value = torch.tensor(1) mock_tensor = mocker.MagicMock(name="tensor") batch = {"input": mock_tensor} train_dataloader = [batch, batch, batch, batch, batch] @@ -103,6 +113,7 @@ def test_save_to_json(temp_output_dir, mocker): train_config.max_train_step = 0 train_config.max_eval_step = 0 train_config.output_dir = temp_output_dir + train_config.flop_counter_start = 0 train_config.use_profiler = False results = train( diff --git a/src/tests/utils.py b/src/tests/utils.py new file mode 100644 index 000000000..14b96a9a1 --- /dev/null +++ b/src/tests/utils.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +from transformers import AutoTokenizer + + +class FakeTokenizer(object): + def __init__(self): + self.pad_token_id = 0 + self.bos_token_id = 42 + self.eos_token_id = 43 + self.sep_token_id = 3 + self.vocab_size = 128256 + + self.pad_token = "<|pad_id|>" + self.bos_token = "<|bos_id|>" + self.eos_token = "<|eos_id|>" + self.sep_token = "<|sep_id|>" + self.tokenizer = self + self.padding_side = "left" + + def __call__(self, *args, **kwargs): + ids = self.encode(*args, **kwargs) + return {"input_ids": ids} + + def encode(self, text, *args, **kwargs): + return [self.bos_token_id] + [len(c) for c in text.split(" ")] + [self.eos_token_id] + + def __len__(self): + return 128256 + + def pad(self, *args, **kwargs): + args = args[0] + max_len = max([len(a["input_ids"]) for a in args]) + for a in args: + for k in a.keys(): + a[k] = a[k] + ([self.pad_token_id if k == "input_ids" else 0] * (max_len - len(a))) + out = {} + for k in args[0].keys(): + out[k] = [a[k] for a in args] + return out + + +def maybe_tokenizer(name): + if name == "fake_llama": + return FakeTokenizer() + try: + return AutoTokenizer.from_pretrained(name) + except OSError: + return None
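To make the new test helpers above easier to reason about, here is a hedged usage sketch of `FakeTokenizer` and `maybe_tokenizer`. It assumes `src/tests/utils.py` is importable as `utils` (for example with `src/tests` on `sys.path`); that import path is an assumption for illustration, not something the diff sets up.

```python
# Usage sketch, assuming src/tests is on sys.path so utils.py imports as `utils`.
from utils import FakeTokenizer, maybe_tokenizer

tok = maybe_tokenizer("fake_llama")   # returns the stub, no checkpoint download needed
assert isinstance(tok, FakeTokenizer)

# encode() emits the fake BOS id, one pseudo-id per whitespace-separated token
# (its character length), and the fake EOS id.
assert tok.encode("hello world") == [42, 5, 5, 43]

# Calling the tokenizer wraps the ids in the usual dict shape.
assert tok("hello world") == {"input_ids": [42, 5, 5, 43]}
```

For real checkpoint names, `maybe_tokenizer` falls back to `AutoTokenizer.from_pretrained` and returns `None` on `OSError` (e.g. a missing or gated checkpoint), which is presumably what the `skip_missing_tokenizer` marker used in the tests above keys off.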