diff --git a/examples/parsing_modes/demo_auto_mode.ipynb b/examples/parsing_modes/demo_auto_mode.ipynb new file mode 100644 index 0000000..bbf8333 --- /dev/null +++ b/examples/parsing_modes/demo_auto_mode.ipynb @@ -0,0 +1,657 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cost-Optimized Parsing with Auto-Mode\n", + "\n", + "Document complexity often varies across pages: some pages are plain text, while others contain images. Text-only pages only require a cheap parsing mode, whereas image-based pages require more advanced modes. In this notebook we show how to take advantage of \"auto-mode\" in LlamaParse, which adaptively parses each page with the appropriate mode, giving you optimal quality at the lowest cost.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-index\n", + "!pip install llama-index-core\n", + "!pip install llama-index-embeddings-openai llama-index-llms-openai\n", + "!pip install llama-parse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-12-08 14:28:09-- https://assets.amazon.science/9f/a3/ae41627f4ab2bde091f1ebc6b830/the-amazon-nova-family-of-models-technical-report-and-model-card.pdf\n", + "Resolving assets.amazon.science (assets.amazon.science)... 18.155.192.66, 18.155.192.102, 18.155.192.84, ...\n", + "Connecting to assets.amazon.science (assets.amazon.science)|18.155.192.66|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 21222963 (20M) [application/pdf]\n", + "Saving to: ‘./data/nova_technical_report.pdf’\n", + "\n", + "./data/nova_technic 100%[===================>] 20.24M 36.1MB/s in 0.6s \n", + "\n", + "2024-12-08 14:28:10 (36.1 MB/s) - ‘./data/nova_technical_report.pdf’ saved [21222963/21222963]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir -p data\n", + "!wget 'https://assets.amazon.science/9f/a3/ae41627f4ab2bde091f1ebc6b830/the-amazon-nova-family-of-models-technical-report-and-model-card.pdf' -O './data/nova_technical_report.pdf'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up our LlamaParse and OpenAI API keys, plus the default LLM and embedding models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# llama-parse is async-first; running its async code in a notebook requires nest_asyncio\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# API access to llama-cloud\n", + "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n", + "\n", + "# Using OpenAI API for embeddings/llms\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.llms.openai import OpenAI\n", + "from llama_index.embeddings.openai import OpenAIEmbedding\n", + "from llama_index.core import VectorStoreIndex\n", + "from llama_index.core import Settings\n", + "\n", + "embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n", + "llm = OpenAI(model=\"gpt-4o-mini\")\n", + "\n", + "Settings.llm = llm\n", + "Settings.embed_model = embed_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Using `LlamaParse` with Auto-Mode\n", + "\n", + "We feed the Uber March 2022 10QA into LlamaParse with auto-mode enabled to get back the Markdown representation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id 1dcfb080-9ee8-4e61-904d-2c94b0dad1cf\n" + ] + } + ], + "source": [ + "from llama_parse import LlamaParse\n", + "\n", + "file_path = \"data/nova_technical_report.pdf\"\n", + "\n", + "documents = LlamaParse(\n", + " result_type=\"markdown\",\n", + " auto_mode=True,\n", + " auto_mode_trigger_on_image_in_page=True,\n", + " # auto_mode_trigger_on_table_in_page=False,\n", + " # auto_mode_trigger_on_text_in_page=\"\"\n", + " # auto_mode_trigger_on_regexp_in_page=\"\"\n", + ").load_data(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id 01a5bbdf-744b-4ac1-8fab-ea963d722164\n", + "...." + ] + } + ], + "source": [ + "base_documents = LlamaParse(result_type=\"markdown\", invalidate_cache=True).load_data(\n", + " file_path\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# base_documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Show Example Pages\n", + "\n", + "Here we show example pages that are parsed with auto-mode. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "from llama_index.core.schema import TextNode\n", + "from llama_index.core import VectorStoreIndex\n", + "\n", + "\n", + "def get_page_nodes(docs, separator=\"\\n---\\n\"):\n", + " \"\"\"Split each document into page node, by separator.\"\"\"\n", + " nodes = []\n", + " for doc in docs:\n", + " doc_chunks = doc.text.split(separator)\n", + " for doc_chunk in doc_chunks:\n", + " node = TextNode(\n", + " text=doc_chunk,\n", + " metadata=deepcopy(doc.metadata),\n", + " )\n", + " nodes.append(node)\n", + "\n", + " return nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "page_nodes = get_page_nodes(documents)\n", + "base_page_nodes = get_page_nodes(base_documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Page 11** contains images and tables, and we can see that auto-mode automatically switches to higher-quality parsing vs. 
the default parsed page.\n", + "\n", + "![](page_11.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "| Nova Micro | Nova Lite | Nova Pro |\n", + "|------------|-----------|----------|\n", + "| Nova Micro performance chart | Nova Lite performance chart | Nova Pro performance chart |\n", + "| Context Length | Context Length | Context Length |\n", + "\n", + "Figure 2: Text Needle-in-a-Haystack recall performance for Nova Micro (up-to 128k), Nova Lite (up-to 300k) and Nova Pro (up-to 300k) models.\n", + "\n", + "| | SQuALITY | LVBench |\n", + "|------------|-----------|----------|\n", + "| | ROUGE-L | accuracy |\n", + "| Nova Pro | 19.8 ±8.7 | 41.6 ±2.5 |\n", + "| Nova Lite | 19.2 ±8.6 | 40.4 ±2.4 |\n", + "| Nova Micro | 18.8 ±8.6 | - |\n", + "| Claude 3.5 Sonnet (Jun) | 13.4 ±7.5 | - |\n", + "| Gemini 1.5 Pro (001) | - | 33.1 ±2.3 |\n", + "| Gemini 1.5 Pro (002) | 19.1 ±8.6 M | - |\n", + "| Gemini 1.5 Flash (002) | 18.1 ±8.4 M | - |\n", + "| GPT-4o | 18.8 ±8.6 | 30.8 ±2.3 |\n", + "| Llama 3 - 70B | 16.4 ±8.1 | - |\n", + "| Llama 3 - 8B | 15.3 ±7.9 | - |\n", + "\n", + "Table 6: Text and Multimodal long context performance on SQuALITY (ROUGE-L) and LVBench (Accuracy). For SQuALITY, measurements for Claude 3.5 Sonnet, GPT-4o, Llama 3 70B and Llama 3 8B are taken from the Llama 3 report [45]. Gemini results were measured by us² (M). For LVBench, Gemini and GPT-4o numbers were taken from the corresponding benchmark leaderboard [77].\n", + "\n", + "Results for text and multimodal long context benchmarks are presented in Table 6. In the long video question answering task, both Amazon Nova Pro and Lite demonstrate robust performance on the LVBench dataset, surpassing other models. Amazon Nova models consistently demonstrate exceptional performance in retrieving information from any depth across both text and multimodal understanding use cases, delivering high accuracy and reliability.\n", + "\n", + "## 2.4 Functional expertise\n", + "\n", + "In addition to core capabilities, foundation models must perform well in particular specialties and domains. Across our many areas of performance analyses, we have selected four domains for which to present benchmarking results: Software engineering, financial analysis, and retrieval-augmented generation. 
Prompt templates for all benchmarks can be found in Appendix B.3.\n" + ] + } + ], + "source": [ + "print(page_nodes[10].get_content())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "| |Nova Micro|Nova Lite|Nova Pro|\n", + "|---|---|---|---|\n", + "|10|10|10|100|\n", + "|20|20|20| |\n", + "|30|30|30|75|\n", + "|40|40|40| |\n", + "|50|50|50|50|\n", + "|60|60|60| |\n", + "|70|70|70|25|\n", + "|80|80|80| |\n", + "|90|90|90| |\n", + "|100|100|100| |\n", + "\n", + "Context Length\n", + "\n", + "Figure 2: Text Needle-in-a-Haystack recall performance for Nova Micro (up-to 128k), Nova Lite (up-to 300k) and Nova Pro (up-to 300k) models.\n", + "\n", + "# SQuALITY\n", + "\n", + "# LVBench\n", + "\n", + "| |ROUGE-L|accuracy|\n", + "|---|---|---|\n", + "|Nova Pro|19.8 ±8.7|41.6 ±2.5|\n", + "|Nova Lite|19.2 ±8.6|40.4 ±2.4|\n", + "|Nova Micro|18.8 ±8.6|-|\n", + "|Claude 3.5 Sonnet (Jun)|13.4 ±7.5|-|\n", + "|Gemini 1.5 Pro (001)|-|33.1 ±2.3|\n", + "|Gemini 1.5 Pro (002)|19.1 ±8.6 M|-|\n", + "|Gemini 1.5 Flash (002)|18.1 ±8.4 M|-|\n", + "|GPT-4o|18.8 ±8.6|30.8 ±2.3|\n", + "|Llama 3 - 70B|16.4 ±8.1|-|\n", + "|Llama 3 - 8B|15.3 ±7.9|-|\n", + "\n", + "Table 6: Text and Multimodal long context performance on SQuALITY (ROUGE-L) and LVBench (Accuracy). For SQuALITY, measurements for Claude 3.5 Sonnet, GPT-4o, Llama 3 70B and Llama 3 8B are taken from the Llama 3 report [45]. Gemini results were measured by us2 (M). For LVBench, Gemini and GPT-4o numbers were taken from the corresponding benchmark leaderboard [77].\n", + "\n", + "Results for text and multimodal long context benchmarks are presented in Table 6. In the long video question answering task, both Amazon Nova Pro and Lite demonstrate robust performance on the LVBench dataset, surpassing other models. Amazon Nova models consistently demonstrate exceptional performance in retrieving information from any depth across both text and multimodal understanding use cases, delivering high accuracy and reliability.\n", + "\n", + "# 2.4 Functional expertise\n", + "\n", + "In addition to core capabilities, foundation models must perform well in particular specialties and domains. Across our many areas of performance analyses, we have selected four domains for which to present benchmarking results: Software engineering, financial analysis, and retrieval-augmented generation. Prompt templates for all benchmarks can be found in Appendix B.3.\n" + ] + } + ], + "source": [ + "print(base_page_nodes[10].get_content())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Page 14** contains all charts. Auto-mode detects these charts and uses premium processing to convert these charts into both tabular and mermaid format. 
Whereas the markdown mode has a few more challenges in converting the chart to markdown.\n", + "\n", + "![](page_14.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "| Model Family | Meta | Amazon | Google | Mistral AI | OpenAI | Anthropic |\n", + "|--------------|------|--------|--------|------------|--------|-----------|\n", + "| Time to First Token (sec) | 0.72 | 0.37 | 0.35 | 0.53 | 0.62 | 0.98 |\n", + "| Output Tokens per Second | 58 | 115 | 190 | 73 | 64 | 29 |\n", + "| Total Response Time (sec) | 2.9 | 1.4 | 0.9 | 2.4 | 2.7 | 4.0 |\n", + "\n", + "```mermaid\n", + "graph TD\n", + " subgraph \"Time to First Token (sec)\"\n", + " A1[Llama 2 7B] --> 0.29\n", + " A2[Nova Micro] --> 0.32\n", + " A3[Gemini 1.5 Pro 1B] --> 0.35\n", + " A4[Gemini 1.5 Pro 1.5B] --> 0.35\n", + " A5[Mistral 8x7B] --> 0.36\n", + " A6[Llama 2 13B] --> 0.36\n", + " A7[Nova Lite] --> 0.37\n", + " A8[Nova Pro] --> 0.38\n", + " A9[Llama 2 70B] --> 0.42\n", + " A10[GPT-3.5] --> 0.42\n", + " A11[Llama 2 34B] --> 0.46\n", + " A12[Mistral Large] --> 0.53\n", + " A13[GPT-4] --> 0.62\n", + " A14[Llama 2 34B] --> 0.72\n", + " A15[Claude 2] --> 0.72\n", + " A16[Claude 3 Sonnet] --> 0.87\n", + " A17[Gemini 1.5 Pro] --> 0.98\n", + " end\n", + "\n", + " subgraph \"Output Tokens per Second\"\n", + " B1[Gemini 1.5 Pro 1B] --> 283\n", + " B2[Nova Micro] --> 210\n", + " B3[Gemini 1.5 Pro] --> 190\n", + " B4[GPT-4] --> 163\n", + " B5[Nova Lite] --> 157\n", + " B6[Llama 2 13B] --> 157\n", + " B7[Llama 2 34B] --> 124\n", + " B8[Mistral 8x7B] --> 115\n", + " B9[GPT-3.5] --> 113\n", + " B10[Nova Pro] --> 100\n", + " B11[Llama 2 70B] --> 73\n", + " B12[Claude 3 Sonnet] --> 64\n", + " B13[Gemini 1.5 Pro] --> 58\n", + " B14[Claude 2] --> 57\n", + " B15[Llama 2 34B] --> 40\n", + " B16[Mistral Large] --> 35\n", + " B17[Llama 2 7B] --> 29\n", + " end\n", + "\n", + " subgraph \"Total Response Time (sec)\"\n", + " C1[Gemini 1.5 Pro 1B] --> 0.7\n", + " C2[Nova Micro] --> 0.8\n", + " C3[Gemini 1.5 Pro] --> 0.9\n", + " C4[Nova Lite] --> 1.0\n", + " C5[Llama 2 13B] --> 1.0\n", + " C6[Llama 2 34B] --> 1.1\n", + " C7[GPT-3.5] --> 1.2\n", + " C8[Mistral 8x7B] --> 1.3\n", + " C9[Nova Pro] --> 1.4\n", + " C10[GPT-4] --> 1.5\n", + " C11[Llama 2 70B] --> 1.7\n", + " C12[Claude 3 Sonnet] --> 2.4\n", + " C13[GPT-4] --> 2.7\n", + " C14[Gemini 1.5 Pro] --> 2.8\n", + " C15[Llama 2 34B] --> 2.9\n", + " C16[Mistral Large] --> 3.4\n", + " C17[Llama 2 7B] --> 4.0\n", + " end\n", + "```\n", + "\n", + "Figure 3: Time to First Token (↓), Output Tokens per Second (↑), and Total Response Time (↓) using 1,000 tokens of input and 100 tokens of output for Amazon Nova models and select publicly-available models (Artificial Analysis, Nov 29th, 2024).\n", + "\n", + "14\n" + ] + } + ], + "source": [ + "print(page_nodes[13].get_content())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "# Model Family\n", + "\n", + "| | |Meta| |Amazon| |Google|Mistral AI| | |OpenAI| |Anthropic| | | | | | | | | | | | | | | | | | | | |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "|1.0| | | | | | | | | | | | | |0.98| | | | | | | | | | | | | | | | | | |\n", + "|0.8| | | 
| | | | | | | | | | |0.87| | | | | | | | | | | | | | | | | | |\n", + "|0.6| | | | | | | | | | | |0.72|0.72| | | | | | | | | | | | | | | | | | | |\n", + "|0.4| | | | | | |0.42|0.42|0.46| | | | | | | | | | | | | | | | | | | | | | | |\n", + "|0.2|0.32|0.35|0.35|0.36|0.36|0.37|0.38| | | | | | | | | | | | | | | | | | | | | | | | | |\n", + "|0.0|9|1|2|1|8|8|2|2|8|8|2|8|1|8|2|8|2|J|2| | | | | | | | | | | | | |\n", + "|1| | | | | | | | | | | | |8|1|1|1|1|5|1|~|{|Tv|!0'A|3|3|8|3|A|2| | | |\n", + "| |283|250|200|210|190|163|157|157| | | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |150| | | |124|115|113|100| | | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |100| | | | | | | |73|64|58|57|40|35| | | | | | | | | | | | | | | | | | |\n", + "| |50| | | | | | | | | | | | |29| | | | | | | | | | | | | | | | | | |\n", + "| |2|8|{|1|2|2|8|9|1|5|8|3|9| | | | | | | | | | | | | | | | | | | |\n", + "| | | | | | | | | | |1|1|2|8|1|1|2|1|~|2|8|3|1|N?|2|{|8|2|~|8|2|3|8|\n", + "| |J| |J| |J| | | | | |80'| |{| | | | | | | | | | | | | | | | | | | |\n", + "| | | | | | | | | | | | | | |4.0| | | | | | | | | | | | | | | | | | |\n", + "| |3.5| | | | | | | | | | | | |3.4| | | | | | | | | | | | | | | | | | |\n", + "| |3.0| | | | | | | | | |2.7|2.8|2.9| | | | | | | | | | | | | | | | | | | |\n", + "| |2.5| | | | | | | | |2.4| | | | | | | | | | | | | | | | | | | | | | |\n", + "| |2.0| | | | | | |1.5|1.7| | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |1.5| |1.2| | |1.3|1.4| | | | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |0.8|0.9|1.0|1.0| | | | | | | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |1.0|0.5|0.7| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |\n", + "| |0.0|8|0|2|g|5|8|9|8|2|2|8|1|K|2|2|8|2|J|{|8|2|8| | | | | | | | | |\n", + "| | | | | | | | | |{| | | | |2|1|~|1|1|8|3|8|3|3| | | | | | | | | |\n", + "| |JN?| | | | | | | | |8|3|K| | | | | | | | | | | | | | | | | | | | |\n", + "\n", + "Figure 3: Time to First Token (↓), Output Tokens per Second (↑), and Total Response Time (↓) using 1,000 tokens of input and 100 tokens of output for Amazon Nova models and select publicly-available models (Artificial Analysis, Nov 29th, 2024).\n" + ] + } + ], + "source": [ + "print(base_page_nodes[13].get_content())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Page 3** is fully text, and we can see there's no difference between the auto-mode parsed page vs. the default markdown-mode parsed page. \n", + "\n", + "![](page_3.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "# 1 Introduction\n", + "\n", + "This document introduces Amazon Nova, a new generation of state-of-the-art foundation models that deliver frontier intelligence and industry-leading price performance.\n", + "\n", + "# 1.1 Amazon Nova Pro, Lite, and Micro\n", + "\n", + "Key capabilities of Amazon Nova Pro, Lite, and Micro include:\n", + "\n", + "- Frontier intelligence: Amazon Nova models possess frontier intelligence, enabling them to understand and process complex language tasks with state-of-the-art accuracy. Amazon Nova Micro sets new standards in its intelligence tier in several text benchmarks such as Language Understanding (MMLU), Deep Reasoning (GPQA), Mathematics (MATH), and Multi-step Reasoning (Big-Bench Hard). 
Our multimodal models, Amazon Nova Pro and Lite, take text, images, documents, and video as input and generate text as output. These models set standards in several benchmarks such as Video Captioning (VATEX), Visual QA (TextVQA), Function Calling (BFCL), and multimodal agentic benchmarks (GroundUI-1K, VisualWebBench, Mind2Web) in their respective intelligence tiers. These models are the first to offer video understanding capabilities on Amazon Bedrock, enabling deeper insights from multimedia content.\n", + "- Speed: Amazon Nova has been designed for fast inference, with Amazon Micro, Lite, and Pro each being one of the fastest models in their respective intelligence tiers.\n", + "- Agentic Workflows: Amazon Nova Pro, Lite, and Micro can power AI agents capable of breaking down and executing multi-step tasks. These models are integrated with Bedrock Knowledge Bases and they excel at retrieval-augmented generation (RAG) to ensure the best accuracy by grounding their responses to the developer’s data.\n", + "- Customizability: Developers can fine-tune these models with multimodal data (Pro and Lite) or text data (Pro, Lite, and Micro), providing the flexibility to achieve desired accuracy, latency, and cost. Developers can also run self-service Custom Fine-Tuning (CFT) and distillation of larger models to smaller ones via Bedrock APIs.\n", + "- Price-Performance: Each model was optimized to deliver exceptional price-performance value, offering state-of-the-art performance on key benchmarks at low cost.\n", + "\n", + "Amazon Nova Pro, Lite, and Micro are based on the Transformer architecture [74]. Each model went through a series of training processes that began with pretraining using a mixture of large amounts of multilingual and multimodal data. Our models were trained on data from a variety of sources, including licensed data, proprietary data, open source datasets, and publicly available data where appropriate. We curated data from over 200 languages, with particular emphasis on Arabic, Dutch, English, French, German, Hebrew, Hindi, Italian, Japanese, Korean, Portuguese, Russian, Simplified Chinese, Spanish, and Turkish. After pretraining, models iteratively went through a series of fine-tuning stages, including Supervised Fine-Tuning (SFT) on instruction-demonstration pairs (including multimodal ones) and reward model (RM) training from human preference data [59]. Finally, the models learned from human preferences via methods like Direct Preference Optimization (DPO) [62] and Proximal Policy Optimization (PPO) [68] to ensure that the final models are aligned with human preferences in both quality and responsibility.\n", + "\n", + "# 1.2 Amazon Nova Canvas and Reel\n", + "\n", + "Amazon Nova Canvas and Amazon Nova Reel are designed to create realistic multimodal content, including images and videos, for a wide range of applications such as advertising, marketing, and entertainment. Amazon Nova Canvas offers the following functionalities, with more details provided in Appendix A:\n", + "\n", + "- Text-to-image generation: Amazon Nova Canvas can generate images with various resolutions (from 512 up to 2K horizontal resolution) and aspect ratios (any aspect ratio between 1:4 and 4:1 with a maximum of 4.2M pixels). 
Customers can provide reference images to guide the model to generate outputs in a specific style or color palette, or to generate variations of an image.\n", + "- Image editing: Amazon Nova Canvas allows precise image editing operations like inpainting and outpainting through natural language mask prompts. These mask prompts describe the specific area of the input image that needs to be repainted. The user can also easily change a background with the background removal feature, leaving the subject of the image unchanged.\n" + ] + } + ], + "source": [ + "print(page_nodes[2].get_content())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# The Amazon Nova Family of Models\n", + "\n", + "# 1 Introduction\n", + "\n", + "This document introduces Amazon Nova, a new generation of state-of-the-art foundation models that deliver frontier intelligence and industry-leading price performance.\n", + "\n", + "# 1.1 Amazon Nova Pro, Lite, and Micro\n", + "\n", + "Key capabilities of Amazon Nova Pro, Lite, and Micro include:\n", + "\n", + "- Frontier intelligence: Amazon Nova models possess frontier intelligence, enabling them to understand and process complex language tasks with state-of-the-art accuracy. Amazon Nova Micro sets new standards in its intelligence tier in several text benchmarks such as Language Understanding (MMLU), Deep Reasoning (GPQA), Mathematics (MATH), and Multi-step Reasoning (Big-Bench Hard). Our multimodal models, Amazon Nova Pro and Lite, take text, images, documents, and video as input and generate text as output. These models set standards in several benchmarks such as Video Captioning (VATEX), Visual QA (TextVQA), Function Calling (BFCL), and multimodal agentic benchmarks (GroundUI-1K, VisualWebBench, Mind2Web) in their respective intelligence tiers. These models are the first to offer video understanding capabilities on Amazon Bedrock, enabling deeper insights from multimedia content.\n", + "- Speed: Amazon Nova has been designed for fast inference, with Amazon Micro, Lite, and Pro each being one of the fastest models in their respective intelligence tiers.\n", + "- Agentic Workflows: Amazon Nova Pro, Lite, and Micro can power AI agents capable of breaking down and executing multi-step tasks. These models are integrated with Bedrock Knowledge Bases and they excel at retrieval-augmented generation (RAG) to ensure the best accuracy by grounding their responses to the developer’s data.\n", + "- Customizability: Developers can fine-tune these models with multimodal data (Pro and Lite) or text data (Pro, Lite, and Micro), providing the flexibility to achieve desired accuracy, latency, and cost. Developers can also run self-service Custom Fine-Tuning (CFT) and distillation of larger models to smaller ones via Bedrock APIs.\n", + "- Price-Performance: Each model was optimized to deliver exceptional price-performance value, offering state-of-the-art performance on key benchmarks at low cost.\n", + "\n", + "Amazon Nova Pro, Lite, and Micro are based on the Transformer architecture [74]. Each model went through a series of training processes that began with pretraining using a mixture of large amounts of multilingual and multimodal data. Our models were trained on data from a variety of sources, including licensed data, proprietary data, open source datasets, and publicly available data where appropriate. 
We curated data from over 200 languages, with particular emphasis on Arabic, Dutch, English, French, German, Hebrew, Hindi, Italian, Japanese, Korean, Portuguese, Russian, Simplified Chinese, Spanish, and Turkish. After pretraining, models iteratively went through a series of fine-tuning stages, including Supervised Fine-Tuning (SFT) on instruction-demonstration pairs (including multimodal ones) and reward model (RM) training from human preference data [59]. Finally, the models learned from human preferences via methods like Direct Preference Optimization (DPO) [62] and Proximal Policy Optimization (PPO) [68] to ensure that the final models are aligned with human preferences in both quality and responsibility.\n", + "\n", + "# 1.2 Amazon Nova Canvas and Reel\n", + "\n", + "Amazon Nova Canvas and Amazon Nova Reel are designed to create realistic multimodal content, including images and videos, for a wide range of applications such as advertising, marketing, and entertainment. Amazon Nova Canvas offers the following functionalities, with more details provided in Appendix A:\n", + "\n", + "- Text-to-image generation: Amazon Nova Canvas can generate images with various resolutions (from 512 up to 2K horizontal resolution) and aspect ratios (any aspect ratio between 1:4 and 4:1 with a maximum of 4.2M pixels). Customers can provide reference images to guide the model to generate outputs in a specific style or color palette, or to generate variations of an image.\n", + "- Image editing: Amazon Nova Canvas allows precise image editing operations like inpainting and outpainting through natural language mask prompts. These mask prompts describe the specific area of the input image that needs to be repainted. The user can also easily change a background with the background removal feature, leaving the subject of the image unchanged.\n" + ] + } + ], + "source": [ + "print(base_page_nodes[2].get_content())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a Simple RAG Pipeline\n", + "\n", + "Let's set up a simple RAG pipeline over these documents! 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# dump both indexed tables and page text into the vector index\n", + "vector_index = VectorStoreIndex(page_nodes)\n", + "query_engine = vector_index.as_query_engine(similarity_top_k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = query_engine.query(\n", + " \"Give me a comparison graph of time-to-first-token among all models\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is a comparison of the Time to First Token (TTFT) for various models:\n", + "\n", + "- **Llama 2 7B**: 0.29 sec\n", + "- **Nova Micro**: 0.32 sec\n", + "- **Gemini 1.5 Pro 1B**: 0.35 sec\n", + "- **Gemini 1.5 Pro 1.5B**: 0.35 sec\n", + "- **Mistral 8x7B**: 0.36 sec\n", + "- **Llama 2 13B**: 0.36 sec\n", + "- **Nova Lite**: 0.37 sec\n", + "- **Nova Pro**: 0.38 sec\n", + "- **Llama 2 70B**: 0.42 sec\n", + "- **GPT-3.5**: 0.42 sec\n", + "- **Llama 2 34B**: 0.46 sec\n", + "- **Mistral Large**: 0.53 sec\n", + "- **GPT-4**: 0.62 sec\n", + "- **Llama 2 34B**: 0.72 sec\n", + "- **Claude 2**: 0.72 sec\n", + "- **Claude 3 Sonnet**: 0.87 sec\n", + "- **Gemini 1.5 Pro**: 0.98 sec\n", + "\n", + "This data can be visualized in a bar graph format, with the models on the x-axis and the corresponding TTFT values on the y-axis, showing the performance of each model in terms of response time.\n" + ] + } + ], + "source": [ + "print(str(response))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_parse", + "language": "python", + "name": "llama_parse" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/parsing_modes/page_11.png b/examples/parsing_modes/page_11.png new file mode 100644 index 0000000..fe36ee6 Binary files /dev/null and b/examples/parsing_modes/page_11.png differ diff --git a/examples/parsing_modes/page_14.png b/examples/parsing_modes/page_14.png new file mode 100644 index 0000000..b4b8f5f Binary files /dev/null and b/examples/parsing_modes/page_14.png differ diff --git a/examples/parsing_modes/page_3.png b/examples/parsing_modes/page_3.png new file mode 100644 index 0000000..b7989ed Binary files /dev/null and b/examples/parsing_modes/page_3.png differ