diff --git a/examples_notebooks/global_search/index.html b/examples_notebooks/global_search/index.html index dfd58213e9..587b437a12 100644 --- a/examples_notebooks/global_search/index.html +++ b/examples_notebooks/global_search/index.html @@ -2248,7 +2248,7 @@
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) -/tmp/ipykernel_2116/1512985616.py in ?() +/tmp/ipykernel_2110/1512985616.py in ?() 2 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") 3 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") 4 entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") diff --git a/examples_notebooks/global_search_with_dynamic_community_selection/index.html b/examples_notebooks/global_search_with_dynamic_community_selection/index.html index 5327ce1fb3..81c0b5dc77 100644 --- a/examples_notebooks/global_search_with_dynamic_community_selection/index.html +++ b/examples_notebooks/global_search_with_dynamic_community_selection/index.html @@ -2156,7 +2156,7 @@Load community repo
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) -/tmp/ipykernel_2146/2760368953.py in ?() +/tmp/ipykernel_2143/2760368953.py in ?() 2 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") 3 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") 4 entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") diff --git a/examples_notebooks/index_migration/index.html b/examples_notebooks/index_migration/index.html index b1fb986c2a..65d4cb3ac5 100644 --- a/examples_notebooks/index_migration/index.html +++ b/examples_notebooks/index_migration/index.html @@ -2324,9 +2324,8 @@Index Migration
from datashaper import NoopVerbCallbacks - -from graphrag.cache.factory import create_cache +-from graphrag.cache.factory import create_cache +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings # We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place @@ -2355,9 +2354,8 @@Index Migrationsnapshot_embeddings_enabled=False, )
from datashaper import NoopVerbCallbacks - -from graphrag.cache.factory import create_cache +from graphrag.cache.factory import create_cache +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings # We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place @@ -2399,12 +2397,10 @@diff --git a/index/architecture/index.html b/index/architecture/index.html index d01f53c0c1..578e5cb653 100644 --- a/index/architecture/index.html +++ b/index/architecture/index.html @@ -628,27 +628,9 @@Index Migration--------------------------------------------------------------------------- ImportError Traceback (most recent call last) -Cell In[7], line 3 - 1 from datashaper import NoopVerbCallbacks -----> 3 from graphrag.cache.factory import create_cache - 4 from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings - 6 # We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place - 7 # We'll construct the context and run this function flow directly to avoid everything else +Cell In[7], line 1 +----> 1 from graphrag.cache.factory import create_cache + 2 from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks + 3 from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings ImportError: cannot import name 'create_cache' from 'graphrag.cache.factory' (/home/runner/work/graphrag/graphrag/graphrag/cache/factory.py)
- + - DataShaper Workflows - - - - - -- - - LLM-based Workflow Steps - - - - - -- - - Workflow Graphs + Workflows @@ -1493,27 +1475,9 @@ - - - DataShaper Workflows - - - - - -- + - LLM-based Workflow Steps - - - - - -- - - Workflow Graphs + Workflows @@ -1566,24 +1530,8 @@ Knowledge Model
In order to support the GraphRAG system, the outputs of the indexing engine (in the Default Configuration Mode) are aligned to a knowledge model we call the GraphRAG Knowledge Model. This model is designed to be an abstraction over the underlying data storage technology, and to provide a common interface for the GraphRAG system to interact with. In normal use cases, the outputs of the GraphRAG Indexer would be loaded into a database system, and GraphRAG's Query Engine would interact with the database using the knowledge model data-store types.
-DataShaper Workflows
-GraphRAG's Indexing Pipeline is built on top of our open-source library, DataShaper. -DataShaper is a data processing library that allows users to declaratively express data pipelines, schemas, and related assets using well-defined schemas. -DataShaper has implementations in JavaScript and Python, and is designed to be extensible to other languages.
-One of the core resource types within DataShaper is a Workflow. -Workflows are expressed as sequences of steps, which we call verbs. -Each step has a verb name and a configuration object. -In DataShaper, these verbs model relational concepts such as SELECT, DROP, JOIN, etc.. Each verb transforms an input data table, and that table is passed down the pipeline.
----- -title: Sample Workflow ---- -flowchart LR - input[Input Table] --> select[SELECT] --> join[JOIN] --> binarize[BINARIZE] --> output[Output Table]
LLM-based Workflow Steps
-GraphRAG's Indexing Pipeline implements a handful of custom verbs on top of the standard, relational verbs that our DataShaper library provides. These verbs give us the ability to augment text documents with rich, structured data using the power of LLMs such as GPT-4. We utilize these verbs in our standard workflow to extract entities, relationships, claims, community structures, and community reports and summaries. This behavior is customizable and can be extended to support many kinds of AI-based data enrichment and extraction tasks.
-Workflow Graphs
-Because of the complexity of our data indexing tasks, we needed to be able to express our data pipeline as series of multiple, interdependent workflows. -In the GraphRAG Indexing Pipeline, each workflow may define dependencies on other workflows, effectively forming a directed acyclic graph (DAG) of workflows, which is then used to schedule processing.
+Workflows
+Because of the complexity of our data indexing tasks, we needed to be able to express our data pipeline as a series of multiple, interdependent workflows.
--- title: Sample Workflow DAG --- @@ -1599,7 +1547,7 @@
Dataframe Message Format
The primary unit of communication between workflows, and between workflow steps is an instance of pandas.DataFrame. Although side-effects are possible, our goal is to be data-centric and table-centric in our approach to data processing. This allows us to easily reason about our data, and to leverage the power of dataframe-based ecosystems.
-Our underlying dataframe technology may change over time, but our primary goal is to support the DataShaper workflow schema while retaining single-machine ease of use and developer ergonomics.
+Our underlying dataframe technology may change over time, but our primary goal is to support the workflow schema while retaining single-machine ease of use and developer ergonomics.
LLM Caching
The GraphRAG library was designed with LLM interactions in mind, and a common setback when working with LLM APIs is the variety of errors that can occur due to network latency, throttling, etc. Because of these potential error cases, we've added a cache layer around LLM interactions. diff --git a/search/search_index.json b/search/search_index.json index 1c8c995a8e..2641d067af 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "", "title": "Welcome to GraphRAG", "text": "
\ud83d\udc49 Microsoft Research Blog Post \ud83d\udc49 GraphRAG Accelerator \ud83d\udc49 GraphRAG Arxiv
Figure 1: An LLM-generated knowledge graph built using GPT-4 Turbo.
GraphRAG is a structured, hierarchical approach to Retrieval Augmented Generation (RAG), as opposed to naive semantic-search approaches using plain text snippets. The GraphRAG process involves extracting a knowledge graph out of raw text, building a community hierarchy, generating summaries for these communities, and then leveraging these structures when performing RAG-based tasks.
To learn more about GraphRAG and how it can be used to enhance your LLM's ability to reason about your private data, please visit the Microsoft Research Blog Post.
"}, {"location": "#solution-accelerator", "title": "Solution Accelerator \ud83d\ude80", "text": "To quickstart the GraphRAG system we recommend trying the Solution Accelerator package. This provides a user-friendly end-to-end experience with Azure resources.
"}, {"location": "#get-started-with-graphrag", "title": "Get Started with GraphRAG \ud83d\ude80", "text": "To start using GraphRAG, check out the Get Started guide. For a deeper dive into the main sub-systems, please visit the docpages for the Indexer and Query packages.
"}, {"location": "#graphrag-vs-baseline-rag", "title": "GraphRAG vs Baseline RAG \ud83d\udd0d", "text": "Retrieval-Augmented Generation (RAG) is a technique to improve LLM outputs using real-world information. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique, which we call Baseline RAG. GraphRAG uses knowledge graphs to provide substantial improvements in question-and-answer performance when reasoning about complex information. RAG techniques have shown promise in helping LLMs to reason about private datasets - data that the LLM is not trained on and has never seen before, such as an enterprise\u2019s proprietary research, business documents, or communications. Baseline RAG was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:
- Baseline RAG struggles to connect the dots. This happens when answering a question requires traversing disparate pieces of information through their shared attributes in order to provide new synthesized insights.
- Baseline RAG performs poorly when being asked to holistically understand summarized semantic concepts over large data collections or even singular large documents.
To address this, the tech community is working to develop methods that extend and enhance RAG. Microsoft Research\u2019s new approach, GraphRAG, uses LLMs to create a knowledge graph based on an input corpus. This graph, along with community summaries and graph machine learning outputs, is used to augment prompts at query time. GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.
"}, {"location": "#the-graphrag-process", "title": "The GraphRAG Process \ud83e\udd16", "text": "GraphRAG builds upon our prior research and tooling using graph machine learning. The basic steps of the GraphRAG process are as follows:
"}, {"location": "#index", "title": "Index", "text": ""}, {"location": "#query", "title": "Query", "text": "
- Slice up an input corpus into a series of TextUnits, which act as analyzable units for the rest of the process, and provide fine-grained references in our outputs.
- Extract all entities, relationships, and key claims from the TextUnits using an LLM.
- Perform a hierarchical clustering of the graph using the Leiden technique. To see this visually, check out Figure 1 above. Each circle is an entity (e.g., a person, place, or organization), with the size representing the degree of the entity, and the color representing its community.
- Generate summaries of each community and its constituents from the bottom-up. This aids in holistic understanding of the dataset.
At query time, these structures are used to provide materials for the LLM context window when answering a question. The primary query modes are:
"}, {"location": "#prompt-tuning", "title": "Prompt Tuning", "text": "
- Global Search for reasoning about holistic questions about the corpus by leveraging the community summaries.
- Local Search for reasoning about specific entities by fanning-out to their neighbors and associated concepts.
- DRIFT Search for reasoning about specific entities by fanning-out to their neighbors and associated concepts, but with the added context of community information.
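Each of these modes maps to a --method value on the graphrag query CLI documented later on this page. A minimal sketch (the project root and query text are illustrative):

```bash
# Global search: holistic, corpus-level questions answered from community summaries
graphrag query --root ./ragtest --method global --query "What are the top themes in this dataset?"

# Local search: entity-centric questions that fan out to neighbors and related concepts
graphrag query --root ./ragtest --method local --query "Who is Scrooge and what are his main relationships?"

# DRIFT search: entity-centric questions enriched with community context
graphrag query --root ./ragtest --method drift --query "Who is Scrooge and what are his main relationships?"
```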
Using GraphRAG with your data out of the box may not yield the best possible results. We strongly recommend fine-tuning your prompts following the Prompt Tuning Guide in our documentation.
"}, {"location": "blog_posts/", "title": "Microsoft Research Blog", "text": "
GraphRAG: Unlocking LLM discovery on narrative private data
Published February 13, 2024
By Jonathan Larson, Senior Principal Data Architect; Steven Truitt, Principal Program Manager
GraphRAG: New tool for complex data discovery now on GitHub
Published July 2, 2024
By Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Steven Truitt, Principal Program Manager; Jonathan Larson, Senior Principal Data Architect
GraphRAG auto-tuning provides rapid adaptation to new domains
Published September 9, 2024
By Alonso Guevara Fern\u00e1ndez, Sr. Software Engineer; Katy Smith, Data Scientist II; Joshua Bradley, Senior Data Scientist; Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Sarah Smith, Senior Program Manager; Ben Cutler, Senior Director; Steven Truitt, Principal Program Manager; Jonathan Larson, Senior Principal Data Architect
Introducing DRIFT Search: Combining global and local search methods to improve quality and efficiency
Published October 31, 2024
By Julian Whiting, Senior Machine Learning Engineer; Zachary Hills, Senior Software Engineer; Alonso Guevara Fern\u00e1ndez, Sr. Software Engineer; Ha Trinh, Senior Data Scientist; Adam Bradley, Managing Partner, Strategic Research; Jonathan Larson, Senior Principal Data Architect
GraphRAG: Improving global search via dynamic community selection
Published November 15, 2024
By Bryan Li, Research Intern; Ha Trinh, Senior Data Scientist; Darren Edge, Senior Director; Jonathan Larson, Senior Principal Data Architect
LazyGraphRAG: Setting a new standard for quality and cost
Published November 25, 2024
By Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Jonathan Larson, Senior Principal Data Architect
Moving to GraphRAG 1.0 \u2013 Streamlining ergonomics for developers and users
Published December 16, 2024
By Nathan Evans, Principal Software Architect; Alonso Guevara Fern\u00e1ndez, Senior Software Engineer; Joshua Bradley, Senior Data Scientist
"}, {"location": "cli/", "title": "CLI Reference", "text": "This page documents the command-line interface of the graphrag library.
"}, {"location": "cli/#graphrag", "title": "graphrag", "text": "GraphRAG: A graph-based retrieval-augmented generation (RAG) system.
Usage:
[OPTIONS] COMMAND [ARGS]...\n
Options:
"}, {"location": "cli/#index", "title": "index", "text": "--install-completion Install completion for the current shell.\n --show-completion Show completion for the current shell, to copy it or\n customize the installation.\n
Build a knowledge graph index.
Usage:
index [OPTIONS]\n
Options:
"}, {"location": "cli/#init", "title": "init", "text": "--config PATH The configuration to use.\n --root PATH The project root directory. \\[default: .]\n --verbose / --no-verbose Run the indexing pipeline with verbose\n logging \\[default: no-verbose]\n --memprofile / --no-memprofile Run the indexing pipeline with memory\n profiling \\[default: no-memprofile]\n --resume TEXT Resume a given indexing run\n --logger [rich|print|none] The progress logger to use. \\[default:\n rich]\n --dry-run / --no-dry-run Run the indexing pipeline without executing\n any steps to inspect and validate the\n configuration. \\[default: no-dry-run]\n --cache / --no-cache Use LLM cache. \\[default: cache]\n --skip-validation / --no-skip-validation\n Skip any preflight validation. Useful when\n running no LLM steps. \\[default: no-skip-\n validation]\n --output PATH Indexing pipeline output directory.\n Overrides storage.base_dir in the\n configuration file.\n
Generate a default configuration file.
Usage:
init [OPTIONS]\n
Options:
"}, {"location": "cli/#prompt-tune", "title": "prompt-tune", "text": "--root PATH The project root directory. \\[required]\n
Generate custom graphrag prompts with your own data (i.e. auto templating).
Usage:
prompt-tune [OPTIONS]\n
Options:
"}, {"location": "cli/#query", "title": "query", "text": "--root PATH The project root directory. \\[default: .]\n --config PATH The configuration to use.\n --domain TEXT The domain your input data is related to.\n For example 'space science', 'microbiology',\n 'environmental news'. If not defined, a\n domain will be inferred from the input data.\n --selection-method [all|random|top|auto]\n The text chunk selection method. \\[default:\n random]\n --n-subset-max INTEGER The number of text chunks to embed when\n --selection-method=auto. \\[default: 300]\n --k INTEGER The maximum number of documents to select\n from each centroid when --selection-\n method=auto. \\[default: 15]\n --limit INTEGER The number of documents to load when\n --selection-method={random,top}. \\[default:\n 15]\n --max-tokens INTEGER The max token count for prompt generation.\n \\[default: 2000]\n --min-examples-required INTEGER\n The minimum number of examples to\n generate/include in the entity extraction\n prompt. \\[default: 2]\n --chunk-size INTEGER The max token count for prompt generation.\n \\[default: 200]\n --language TEXT The primary language used for inputs and\n outputs in graphrag prompts.\n --discover-entity-types / --no-discover-entity-types\n Discover and extract unspecified entity\n types. \\[default: discover-entity-types]\n --output PATH The directory to save prompts to, relative\n to the project root directory. \\[default:\n prompts]\n
Query a knowledge graph index.
Usage:
query [OPTIONS]\n
Options:
"}, {"location": "cli/#update", "title": "update", "text": "--method [local|global|drift|basic]\n The query algorithm to use. \\[required]\n --query TEXT The query to execute. \\[required]\n --config PATH The configuration to use.\n --data PATH Indexing pipeline output directory (i.e.\n contains the parquet files).\n --root PATH The project root directory. \\[default: .]\n --community-level INTEGER The community level in the Leiden community\n hierarchy from which to load community\n reports. Higher values represent reports\n from smaller communities. \\[default: 2]\n --dynamic-community-selection / --no-dynamic-community-selection\n Use global search with dynamic community\n selection. \\[default: no-dynamic-community-\n selection]\n --response-type TEXT Free form text describing the response type\n and format, can be anything, e.g. Multiple\n Paragraphs, Single Paragraph, Single\n Sentence, List of 3-7 Points, Single Page,\n Multi-Page Report. Default: Multiple\n Paragraphs \\[default: Multiple Paragraphs]\n --streaming / --no-streaming Print response in a streaming manner.\n \\[default: no-streaming]\n
Update an existing knowledge graph index.
Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the
update_output
folder.Usage:
update [OPTIONS]\n
Options:
"}, {"location": "developing/", "title": "Development Guide", "text": ""}, {"location": "developing/#requirements", "title": "Requirements", "text": "Name Installation Purpose Python 3.10-3.12 Download The library is Python-based. Poetry Instructions Poetry is used for package management and virtualenv management in Python codebases"}, {"location": "developing/#getting-started", "title": "Getting Started", "text": ""}, {"location": "developing/#install-dependencies", "title": "Install Dependencies", "text": "--config PATH The configuration to use.\n --root PATH The project root directory. \\[default: .]\n --verbose / --no-verbose Run the indexing pipeline with verbose\n logging \\[default: no-verbose]\n --memprofile / --no-memprofile Run the indexing pipeline with memory\n profiling \\[default: no-memprofile]\n --logger [rich|print|none] The progress logger to use. \\[default:\n rich]\n --cache / --no-cache Use LLM cache. \\[default: cache]\n --skip-validation / --no-skip-validation\n Skip any preflight validation. Useful when\n running no LLM steps. \\[default: no-skip-\n validation]\n --output PATH Indexing pipeline output directory.\n Overrides storage.base_dir in the\n configuration file.\n
"}, {"location": "developing/#execute-the-indexing-engine", "title": "Execute the Indexing Engine", "text": "# Install Python dependencies.\npoetry install\n
"}, {"location": "developing/#executing-queries", "title": "Executing Queries", "text": "poetry run poe index <...args>\n
"}, {"location": "developing/#azurite", "title": "Azurite", "text": "poetry run poe query <...args>\n
Some unit and smoke tests use Azurite to emulate Azure resources. This can be started by running:
./scripts/start-azurite.sh\n
or by simply running
"}, {"location": "developing/#lifecycle-scripts", "title": "Lifecycle Scripts", "text": "azurite
in the terminal if already installed globally. See the Azurite documentation for more information about how to install and use Azurite.Our Python package utilizes Poetry to manage dependencies and poethepoet to manage build scripts.
Available scripts are:
"}, {"location": "developing/#troubleshooting", "title": "Troubleshooting", "text": ""}, {"location": "developing/#runtimeerror-llvm-config-failed-executing-please-point-llvm_config-to-the-path-for-llvm-config-when-running-poetry-install", "title": "\"RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config\" when running poetry install", "text": "
poetry run poe index
- Run the Indexing CLIpoetry run poe query
- Run the Query CLIpoetry build
- This invokespoetry build
, which will build a wheel file and other distributable artifacts.poetry run poe test
- This will execute all tests.poetry run poe test_unit
- This will execute unit tests.poetry run poe test_integration
- This will execute integration tests.poetry run poe test_smoke
- This will execute smoke tests.poetry run poe check
- This will perform a suite of static checks across the package, including:- formatting
- documentation formatting
- linting
- security patterns
- type-checking
poetry run poe fix
- This will apply any available auto-fixes to the package. Usually this is just formatting fixes.poetry run poe fix_unsafe
- This will apply any available auto-fixes to the package, including those that may be unsafe.poetry run poe format
- Explicitly run the formatter across the package.Make sure llvm-9 and llvm-9-dev are installed:
sudo apt-get install llvm-9 llvm-9-dev
and then in your bashrc, add
"}, {"location": "developing/#numba_pymoduleh610-fatal-error-pythonh-no-such-file-or-directory-when-running-poetry-install", "title": "\"numba/_pymodule.h:6:10: fatal error: Python.h: No such file or directory\" when running poetry install", "text": "
export LLVM_CONFIG=/usr/bin/llvm-config-9
Make sure you have python3.10-dev installed or more generally
python<version>-dev
"}, {"location": "developing/#llm-call-constantly-exceeds-tpm-rpm-or-time-limits", "title": "LLM call constantly exceeds TPM, RPM or time limits", "text": "
sudo apt-get install python3.10-dev
"}, {"location": "get_started/", "title": "Getting Started", "text": ""}, {"location": "get_started/#requirements", "title": "Requirements", "text": "
GRAPHRAG_LLM_THREAD_COUNT
andGRAPHRAG_EMBEDDING_THREAD_COUNT
are both set to 50 by default. You can modify these values to reduce concurrency. Please refer to the Configuration DocumentsPython 3.10-3.12
To get started with the GraphRAG system, you have a few options:
\ud83d\udc49 Use the GraphRAG Accelerator solution \ud83d\udc49 Install from pypi. \ud83d\udc49 Use it from source
"}, {"location": "get_started/#quickstart", "title": "Quickstart", "text": "To get started with the GraphRAG system we recommend trying the Solution Accelerator package. This provides a user-friendly end-to-end experience with Azure resources.
"}, {"location": "get_started/#overview", "title": "Overview", "text": "The following is a simple end-to-end example for using the GraphRAG system. It shows how to use the system to index some text, and then use the indexed data to answer questions about the documents.
"}, {"location": "get_started/#install-graphrag", "title": "Install GraphRAG", "text": "pip install graphrag\n
The graphrag library includes a CLI for a no-code approach to getting started. Please review the full CLI documentation for further detail.
"}, {"location": "get_started/#running-the-indexer", "title": "Running the Indexer", "text": "We need to set up a data project and some initial configuration. First let's get a sample dataset ready:
mkdir -p ./ragtest/input\n
Get a copy of A Christmas Carol by Charles Dickens from a trusted source:
"}, {"location": "get_started/#set-up-your-workspace-variables", "title": "Set Up Your Workspace Variables", "text": "curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt\n
To initialize your workspace, first run the
graphrag init
command. Since we have already configured a directory named./ragtest
in the previous step, run the following command:graphrag init --root ./ragtest\n
This will create two files:
.env
andsettings.yaml
in the./ragtest
directory."}, {"location": "get_started/#openai-and-azure-openai", "title": "OpenAI and Azure OpenAI", "text": "
.env
contains the environment variables required to run the GraphRAG pipeline. If you inspect the file, you'll see a single environment variable defined,GRAPHRAG_API_KEY=<API_KEY>
. This is the API key for the OpenAI API or Azure OpenAI endpoint. You can replace this with your own API key. If you are using another form of authentication (i.e. managed identity), please delete this file.settings.yaml
contains the settings for the pipeline. You can modify this file to change the settings for the pipeline.If running in OpenAI mode, update the value of
"}, {"location": "get_started/#azure-openai", "title": "Azure OpenAI", "text": "GRAPHRAG_API_KEY
in the.env
file with your OpenAI API key.In addition, Azure OpenAI users should set the following variables in the settings.yaml file. To find the appropriate sections, just search for the
llm:
configuration, you should see two sections, one for the chat endpoint and one for the embeddings endpoint. Here is an example of how to configure the chat endpoint:type: azure_openai_chat # Or azure_openai_embedding for embeddings\napi_base: https://<instance>.openai.azure.com\napi_version: 2024-02-15-preview # You can customize this for other versions\ndeployment_name: <azure_model_deployment_name>\n
"}, {"location": "get_started/#running-the-indexing-pipeline", "title": "Running the Indexing pipeline", "text": "
- For more details about configuring GraphRAG, see the configuration documentation.
- To learn more about Initialization, refer to the Initialization documentation.
- For more details about using the CLI, refer to the CLI documentation.
Finally we'll run the pipeline!
graphrag index --root ./ragtest\n
This process will take some time to run. This depends on the size of your input data, what model you're using, and the text chunk size being used (these can be configured in your
"}, {"location": "get_started/#using-the-query-engine", "title": "Using the Query Engine", "text": ""}, {"location": "get_started/#running-the-query-engine", "title": "Running the Query Engine", "text": "settings.yml
file). Once the pipeline is complete, you should see a new folder called./ragtest/output
with a series of parquet files.Now let's ask some questions using this dataset.
Here is an example using Global search to ask a high-level question:
graphrag query \\\n--root ./ragtest \\\n--method global \\\n--query \"What are the top themes in this story?\"\n
Here is an example using Local search to ask a more specific question about a particular character:
graphrag query \\\n--root ./ragtest \\\n--method local \\\n--query \"Who is Scrooge and what are his main relationships?\"\n
Please refer to Query Engine docs for detailed information about how to leverage our Local and Global search mechanisms for extracting meaningful insights from data after the Indexer has wrapped up execution.
"}, {"location": "get_started/#visualizing-the-graph", "title": "Visualizing the Graph", "text": "Check out our visualization guide for a more interactive experience in debugging and exploring the knowledge graph.
"}, {"location": "visualization_guide/", "title": "Visualizing and Debugging Your Knowledge Graph", "text": "The following step-by-step guide walks through the process to visualize a knowledge graph after it's been constructed by graphrag. Note that some of the settings recommended below are based on our own experience of what works well. Feel free to change and explore other settings for a better visualization experience!
"}, {"location": "visualization_guide/#1-run-the-pipeline", "title": "1. Run the Pipeline", "text": "Before building an index, please review your
settings.yaml
configuration file and ensure that graphml snapshots is enabled.(Optional) To support other visualization tools and exploration, additional parameters can be enabled that provide access to vector embeddings.snapshots:\n graphml: true\n
After running the indexing pipeline over your data, there will be an output folder (defined by theembed_graph:\n enabled: true # will generate node2vec embeddings for nodes\numap:\n enabled: true # will generate UMAP embeddings for nodes\n
storage.base_dir
setting)."}, {"location": "visualization_guide/#2-locate-the-knowledge-graph", "title": "2. Locate the Knowledge Graph", "text": "
- Output Folder: Contains artifacts from the LLM\u2019s indexing pass.
In the output folder, look for a file named
"}, {"location": "visualization_guide/#3-open-the-graph-in-gephi", "title": "3. Open the Graph in Gephi", "text": "merged_graph.graphml
. graphml is a standard file format supported by many visualization tools. We recommend trying Gephi."}, {"location": "visualization_guide/#4-install-the-leiden-algorithm-plugin", "title": "4. Install the Leiden Algorithm Plugin", "text": "
- Install and open Gephi
- Navigate to the
output
folder containing the various parquet files.- Import the
merged_graph.graphml
file into Gephi. This will result in a fairly plain view of the undirected graph nodes and edges."}, {"location": "visualization_guide/#5-run-statistics", "title": "5. Run Statistics", "text": "
- Go to
Tools
->Plugins
.- Search for \"Leiden Algorithm\".
- Click
Install
and restart Gephi.
- In the
Statistics
tab on the right, clickRun
forAverage Degree
andLeiden Algorithm
."}, {"location": "visualization_guide/#6-color-the-graph-by-clusters", "title": "6. Color the Graph by Clusters", "text": "
- For the Leiden Algorithm, adjust the settings:
- Quality function: Modularity
- Resolution: 1
- Go to the
Appearance
pane in the upper left side of Gephi."}, {"location": "visualization_guide/#7-resize-nodes-by-degree-centrality", "title": "7. Resize Nodes by Degree Centrality", "text": "
- Select
Nodes
, thenPartition
, and click the color palette icon in the upper right.- Choose
Cluster
from the dropdown.- Click the
Palette...
hyperlink, thenGenerate...
.- Uncheck
Limit number of colors
, clickGenerate
, and thenOk
.- Click
Apply
to color the graph. This will color the graph based on the partitions discovered by Leiden."}, {"location": "visualization_guide/#8-layout-the-graph", "title": "8. Layout the Graph", "text": "
- In the
Appearance
pane in the upper left, selectNodes
->Ranking
- Select the
Sizing
icon in the upper right.- Choose
Degree
and set:- Min: 10
- Max: 150
- Click
Apply
.
- In the
Layout
tab in the lower left, selectOpenORD
."}, {"location": "visualization_guide/#9-run-forceatlas2", "title": "9. Run ForceAtlas2", "text": "
- Set
Liquid
andExpansion
stages to 50, and everything else to 0.- Click
Run
and monitor the progress.
- Select
Force Atlas 2
in the layout options."}, {"location": "visualization_guide/#10-add-text-labels-optional", "title": "10. Add Text Labels (Optional)", "text": "
- Adjust the settings:
- Scaling: 15
- Dissuade Hubs: checked
- LinLog mode: uncheck
- Prevent Overlap: checked
- Click
Run
and wait.- Press
Stop
when it looks like the graph nodes have settled and no longer change position significantly.
- Turn on text labels in the appropriate section.
- Configure and resize them as needed.
Your final graph should now be visually organized and ready for analysis!
"}, {"location": "config/env_vars/", "title": "Default Configuration Mode (using Env Vars)", "text": ""}, {"location": "config/env_vars/#text-embeddings-customization", "title": "Text-Embeddings Customization", "text": "By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the
GRAPHRAG_EMBEDDING_TARGET
environment variable toall
.If the embedding target is
"}, {"location": "config/env_vars/#embedded-fields", "title": "Embedded Fields", "text": "all
, and you want to only embed a subset of these fields, you may specify which embeddings to skip using theGRAPHRAG_EMBEDDING_SKIP
argument described below."}, {"location": "config/env_vars/#input-data", "title": "Input Data", "text": "
text_unit.text
document.text
entity.title
entity.description
relationship.description
community.title
community.summary
community.full_content
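For example, a hypothetical .env sketch that embeds every field listed above except two (the skip list is illustrative):

```bash
# Generate embeddings for all plaintext fields, then opt out of two of them
GRAPHRAG_EMBEDDING_TARGET=all
GRAPHRAG_EMBEDDING_SKIP=relationship.description,community.full_content
```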
Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with
"}, {"location": "config/env_vars/#base-llm-settings", "title": "Base LLM Settings", "text": "GRAPHRAG_INPUT_
below. In general, CSV-based data provides the most customizability. Each CSV should at least contain atext
field (which can be mapped with environment variables), but it's helpful if they also havetitle
,timestamp
, andsource
fields. Additional fields can be included as well, which will land as extra fields on theDocument
table.These are the primary settings for configuring LLM connectivity.
Parameter Required? Description Type Default ValueGRAPHRAG_API_KEY
Yes for OpenAI. Optional for AOAI The API key. (Note:OPENAI_API_KEY is also used as a fallback). If not defined when using AOAI, managed identity will be used. |
str|
None`GRAPHRAG_API_BASE
For AOAI The API Base URLstr
None
GRAPHRAG_API_VERSION
For AOAI The AOAI API version.str
None
GRAPHRAG_API_ORGANIZATION
The AOAI organization.str
None
GRAPHRAG_API_PROXY
The AOAI proxy.str
None
"}, {"location": "config/env_vars/#text-generation-settings", "title": "Text Generation Settings", "text": "These settings control the text generation model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
Parameter Required? Description Type Default ValueGRAPHRAG_LLM_TYPE
For AOAI The LLM operation type. Eitheropenai_chat
orazure_openai_chat
str
openai_chat
GRAPHRAG_LLM_DEPLOYMENT_NAME
For AOAI The AOAI model deployment name.str
None
GRAPHRAG_LLM_API_KEY
Yes (uses fallback) The API key. If not defined when using AOAI, managed identity will be used.str
None
GRAPHRAG_LLM_API_BASE
For AOAI (uses fallback) The API Base URLstr
None
GRAPHRAG_LLM_API_VERSION
For AOAI (uses fallback) The AOAI API version.str
None
GRAPHRAG_LLM_API_ORGANIZATION
For AOAI (uses fallback) The AOAI organization.str
None
GRAPHRAG_LLM_API_PROXY
The AOAI proxy.str
None
GRAPHRAG_LLM_MODEL
The LLM model.str
gpt-4-turbo-preview
GRAPHRAG_LLM_MAX_TOKENS
The maximum number of tokens.int
4000
GRAPHRAG_LLM_REQUEST_TIMEOUT
The maximum number of seconds to wait for a response from the chat client.int
180
GRAPHRAG_LLM_MODEL_SUPPORTS_JSON
Indicates whether the given model supports JSON output mode.True
to enable.str
None
GRAPHRAG_LLM_THREAD_COUNT
The number of threads to use for LLM parallelization.int
50GRAPHRAG_LLM_THREAD_STAGGER
The time to wait (in seconds) between starting each thread.float
0.3GRAPHRAG_LLM_CONCURRENT_REQUESTS
The number of concurrent requests to allow for the embedding client.int
25GRAPHRAG_LLM_TOKENS_PER_MINUTE
The number of tokens per minute to allow for the LLM client. 0 = Bypassint
0GRAPHRAG_LLM_REQUESTS_PER_MINUTE
The number of requests per minute to allow for the LLM client. 0 = Bypassint
0GRAPHRAG_LLM_MAX_RETRIES
The maximum number of retries to attempt when a request fails.int
10GRAPHRAG_LLM_MAX_RETRY_WAIT
The maximum number of seconds to wait between retries.int
10GRAPHRAG_LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION
Whether to sleep on rate limit recommendation. (Azure Only)bool
True
GRAPHRAG_LLM_TEMPERATURE
The temperature to use generation.float
0GRAPHRAG_LLM_TOP_P
The top_p to use for sampling.float
1GRAPHRAG_LLM_N
The number of responses to generate.int
1"}, {"location": "config/env_vars/#text-embedding-settings", "title": "Text Embedding Settings", "text": "These settings control the text embedding model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
Parameter Required ? Description Type DefaultGRAPHRAG_EMBEDDING_TYPE
For AOAI The embedding client to use. Eitheropenai_embedding
orazure_openai_embedding
str
openai_embedding
GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
For AOAI The AOAI deployment name.str
None
GRAPHRAG_EMBEDDING_API_KEY
Yes (uses fallback) The API key to use for the embedding client. If not defined when using AOAI, managed identity will be used.str
None
GRAPHRAG_EMBEDDING_API_BASE
For AOAI (uses fallback) The API base URL.str
None
GRAPHRAG_EMBEDDING_API_VERSION
For AOAI (uses fallback) The AOAI API version to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_API_ORGANIZATION
For AOAI (uses fallback) The AOAI organization to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_API_PROXY
The AOAI proxy to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_MODEL
The model to use for the embedding client.str
text-embedding-3-small
GRAPHRAG_EMBEDDING_BATCH_SIZE
The number of texts to embed at once. (Azure limit is 16)int
16GRAPHRAG_EMBEDDING_BATCH_MAX_TOKENS
The maximum tokens per batch (Azure limit is 8191)int
8191GRAPHRAG_EMBEDDING_TARGET
The target fields to embed. Eitherrequired
orall
.str
required
GRAPHRAG_EMBEDDING_SKIP
A comma-separated list of fields to skip embeddings for . (e.g. 'relationship.description')str
None
GRAPHRAG_EMBEDDING_THREAD_COUNT
The number of threads to use for parallelization for embeddings.int
GRAPHRAG_EMBEDDING_THREAD_STAGGER
The time to wait (in seconds) between starting each thread for embeddings.float
50GRAPHRAG_EMBEDDING_CONCURRENT_REQUESTS
The number of concurrent requests to allow for the embedding client.int
25GRAPHRAG_EMBEDDING_TOKENS_PER_MINUTE
The number of tokens per minute to allow for the embedding client. 0 = Bypassint
0GRAPHRAG_EMBEDDING_REQUESTS_PER_MINUTE
The number of requests per minute to allow for the embedding client. 0 = Bypassint
0GRAPHRAG_EMBEDDING_MAX_RETRIES
The maximum number of retries to attempt when a request fails.int
10GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT
The maximum number of seconds to wait between retries.int
10GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION
Whether to sleep on rate limit recommendation. (Azure Only)bool
True
"}, {"location": "config/env_vars/#input-settings", "title": "Input Settings", "text": "These settings control the data input used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
"}, {"location": "config/env_vars/#plaintext-input-data-graphrag_input_file_typetext", "title": "Plaintext Input Data (GRAPHRAG_INPUT_FILE_TYPE
=text)", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_FILE_PATTERN
The file pattern regexp to use when reading input files from the input directory.str
optional.*\\.txt$
"}, {"location": "config/env_vars/#csv-input-data-graphrag_input_file_typecsv", "title": "CSV Input Data (GRAPHRAG_INPUT_FILE_TYPE
=csv)", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_TYPE
The input storage type to use when reading files. (file
orblob
)str
optionalfile
GRAPHRAG_INPUT_FILE_PATTERN
The file pattern regexp to use when reading input files from the input directory.str
optional.*\\.txt$
GRAPHRAG_INPUT_SOURCE_COLUMN
The 'source' column to use when reading CSV input files.str
optionalsource
GRAPHRAG_INPUT_TIMESTAMP_COLUMN
The 'timestamp' column to use when reading CSV input files.str
optionalNone
GRAPHRAG_INPUT_TIMESTAMP_FORMAT
The timestamp format to use when parsing timestamps in the timestamp column.str
optionalNone
GRAPHRAG_INPUT_TEXT_COLUMN
The 'text' column to use when reading CSV input files.str
optionaltext
GRAPHRAG_INPUT_DOCUMENT_ATTRIBUTE_COLUMNS
A list of CSV columns, comma-separated, to incorporate as document fields.str
optionalid
GRAPHRAG_INPUT_TITLE_COLUMN
The 'title' column to use when reading CSV input files.str
optionaltitle
GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optionalNone
GRAPHRAG_INPUT_CONNECTION_STRING
The connection string to use when reading CSV input files from Azure Blob Storage.str
optionalNone
GRAPHRAG_INPUT_CONTAINER_NAME
The container name to use when reading CSV input files from Azure Blob Storage.str
optionalNone
GRAPHRAG_INPUT_BASE_DIR
The base directory to read input files from.str
optionalNone
"}, {"location": "config/env_vars/#data-mapping-settings", "title": "Data Mapping Settings", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_FILE_TYPE
The type of input data,csv
ortext
str
optionaltext
GRAPHRAG_INPUT_ENCODING
The encoding to apply when reading CSV/text input files.str
optionalutf-8
"}, {"location": "config/env_vars/#data-chunking", "title": "Data Chunking", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_CHUNK_SIZE
The chunk size in tokens for text-chunk analysis windows.str
optional 1200GRAPHRAG_CHUNK_OVERLAP
The chunk overlap in tokens for text-chunk analysis windows.str
optional 100GRAPHRAG_CHUNK_BY_COLUMNS
A comma-separated list of document attributes to groupby when performing TextUnit chunking.str
optionalid
GRAPHRAG_CHUNK_ENCODING_MODEL
The encoding model to use for chunking.str
optional The top-level encoding model."}, {"location": "config/env_vars/#prompting-overrides", "title": "Prompting Overrides", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE
The path (relative to the root) of an entity extraction prompt template text file.str
optionalNone
GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS
The maximum number of redrives (gleanings) to invoke when extracting entities in a loop.int
optional 1GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES
A comma-separated list of entity types to extract.str
optionalorganization,person,event,geo
GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL
The encoding model to use for entity extraction.str
optional The top-level encoding model.GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE
The path (relative to the root) of an description summarization prompt template text file.str
optionalNone
GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH
The maximum number of tokens to generate per description summarization.int
optional 500GRAPHRAG_CLAIM_EXTRACTION_ENABLED
Whether claim extraction is enabled for this pipeline.bool
optionalFalse
GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION
The claim_description prompting argument to utilize.string
optional \"Any claims or facts that could be relevant to threat analysis.\"GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE
The claim extraction prompt to utilize.string
optionalNone
GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS
The maximum number of redrives (gleanings) to invoke when extracting claims in a loop.int
optional 1GRAPHRAG_CLAIM_EXTRACTION_ENCODING_MODEL
The encoding model to use for claim extraction.str
optional The top-level encoding modelGRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE
The community reports extraction prompt to utilize.string
optionalNone
GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH
The maximum number of tokens to generate per community reports.int
optional 1500"}, {"location": "config/env_vars/#storage", "title": "Storage", "text": "This section controls the storage mechanism used by the pipeline used for exporting output tables.
Parameter Description Type Required or Optional DefaultGRAPHRAG_STORAGE_TYPE
The type of storage to use. Options arefile
,memory
, orblob
str
optionalfile
GRAPHRAG_STORAGE_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_STORAGE_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_STORAGE_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_STORAGE_BASE_DIR
The base path to data outputs outputs.str
optional None"}, {"location": "config/env_vars/#cache", "title": "Cache", "text": "This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results.
Parameter Description Type Required or Optional DefaultGRAPHRAG_CACHE_TYPE
The type of cache to use. Options arefile
,memory
,none
orblob
str
optionalfile
GRAPHRAG_CACHE_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_CACHE_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_CACHE_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_CACHE_BASE_DIR
The base path to the cache files.str
optional None"}, {"location": "config/env_vars/#reporting", "title": "Reporting", "text": "This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to the console or to an Azure Blob Storage container.
Parameter Description Type Required or Optional DefaultGRAPHRAG_REPORTING_TYPE
The type of reporter to use. Options arefile
,console
, orblob
str
optionalfile
GRAPHRAG_REPORTING_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_REPORTING_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_REPORTING_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_REPORTING_BASE_DIR
The base path to the reporting outputs.str
optional None"}, {"location": "config/env_vars/#node2vec-parameters", "title": "Node2Vec Parameters", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_NODE2VEC_ENABLED
Whether to enable Node2Vecbool
optional FalseGRAPHRAG_NODE2VEC_NUM_WALKS
The Node2Vec number of walks to performint
optional 10GRAPHRAG_NODE2VEC_WALK_LENGTH
The Node2Vec walk lengthint
optional 40GRAPHRAG_NODE2VEC_WINDOW_SIZE
The Node2Vec window sizeint
optional 2GRAPHRAG_NODE2VEC_ITERATIONS
The number of iterations to run node2vecint
optional 3GRAPHRAG_NODE2VEC_RANDOM_SEED
The random seed to use for node2vecint
optional 597832"}, {"location": "config/env_vars/#data-snapshotting", "title": "Data Snapshotting", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_SNAPSHOT_EMBEDDINGS
Whether to enable embeddings snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_GRAPHML
Whether to enable GraphML snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_RAW_ENTITIES
Whether to enable raw entity snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_TOP_LEVEL_NODES
Whether to enable top-level node snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_TRANSIENT
Whether to enable transient table snapshots.bool
optional False"}, {"location": "config/env_vars/#miscellaneous-settings", "title": "Miscellaneous Settings", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_ASYNC_MODE
Which async mode to use. Eitherasyncio
orthreaded
.str
optionalasyncio
GRAPHRAG_ENCODING_MODEL
The text encoding model, used in tiktoken, to encode text.str
optionalcl100k_base
GRAPHRAG_MAX_CLUSTER_SIZE
The maximum number of entities to include in a single Leiden cluster.int
optional 10GRAPHRAG_SKIP_WORKFLOWS
A comma-separated list of workflow names to skip.str
optionalNone
GRAPHRAG_UMAP_ENABLED
Whether to enable UMAP layoutsbool
optional False"}, {"location": "config/init/", "title": "Configuring GraphRAG Indexing", "text": "To start using GraphRAG, you must generate a configuration file. The
"}, {"location": "config/init/#usage", "title": "Usage", "text": "init
command is the easiest way to get started. It will create a.env
andsettings.yaml
files in the specified directory with the necessary configuration settings. It will also output the default LLM prompts used by GraphRAG."}, {"location": "config/init/#options", "title": "Options", "text": "graphrag init [--root PATH]\n
"}, {"location": "config/init/#example", "title": "Example", "text": "
--root PATH
- The project root directory to initialize graphrag at. Default is the current directory."}, {"location": "config/init/#output", "title": "Output", "text": "graphrag init --root ./ragtest\n
The
init
command will create the following files in the specified directory:"}, {"location": "config/init/#next-steps", "title": "Next Steps", "text": "
settings.yaml
- The configuration settings file. This file contains the configuration settings for GraphRAG..env
- The environment variables file. These are referenced in thesettings.yaml
file.prompts/
- The LLM prompts folder. This contains the default prompts used by GraphRAG, you can modify them or run the Auto Prompt Tuning command to generate new prompts adapted to your data.After initializing your workspace, you can either run the Prompt Tuning command to adapt the prompts to your data or even start running the Indexing Pipeline to index your data. For more information on configuring GraphRAG, see the Configuration documentation.
"}, {"location": "config/overview/", "title": "Configuring GraphRAG Indexing", "text": "The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine.
"}, {"location": "config/overview/#default-configuration-mode", "title": "Default Configuration Mode", "text": "The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. The main ways to set up GraphRAG in Default Configuration mode are via:
"}, {"location": "config/yaml/", "title": "Default Configuration Mode (using YAML/JSON)", "text": "
- Init command (recommended)
- Using YAML for deeper control
- Purely using environment variables
The default configuration mode may be configured by using a
settings.yml
orsettings.json
file in the data project root. If a.env
file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using${ENV_VAR}
syntax. We initialize with YML by default ingraphrag init
but you may use the equivalent JSON form if preferred.Many of these config values have defaults. Rather than replicate them here, please refer to the constants in the code directly.
For example:
"}, {"location": "config/yaml/#config-sections", "title": "Config Sections", "text": ""}, {"location": "config/yaml/#indexing", "title": "Indexing", "text": ""}, {"location": "config/yaml/#llm", "title": "llm", "text": "# .env\nGRAPHRAG_API_KEY=some_api_key\n\n# settings.yml\nllm: \n api_key: ${GRAPHRAG_API_KEY}\n
This is the base LLM configuration section. Other steps may override this configuration with their own LLM configuration.
"}, {"location": "config/yaml/#fields", "title": "Fields", "text": ""}, {"location": "config/yaml/#parallelization", "title": "parallelization", "text": ""}, {"location": "config/yaml/#fields_1", "title": "Fields", "text": "
api_key
str - The OpenAI API key to use.type
openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding - The type of LLM to use.model
str - The model name.max_tokens
int - The maximum number of output tokens.request_timeout
float - The per-request timeout.api_base
str - The API base url to use.api_version
str - The API versionorganization
str - The client organization.proxy
str - The proxy URL to use.audience
str - (Azure OpenAI only) The URI of the target Azure resource/service for which a managed identity token is requested. Used ifapi_key
is not defined. Default=https://cognitiveservices.azure.com/.default
deployment_name
str - The deployment name to use (Azure).model_supports_json
bool - Whether the model supports JSON-mode output.tokens_per_minute
int - Set a leaky-bucket throttle on tokens-per-minute.requests_per_minute
int - Set a leaky-bucket throttle on requests-per-minute.max_retries
int - The maximum number of retries to use.max_retry_wait
float - The maximum backoff time.sleep_on_rate_limit_recommendation
bool - Whether to adhere to sleep recommendations (Azure).concurrent_requests
int The number of open requests to allow at once.temperature
float - The temperature to use.top_p
float - The top-p value to use.n
int - The number of completions to generate."}, {"location": "config/yaml/#async_mode", "title": "async_mode", "text": "
stagger
float - The threading stagger value.num_threads
int - The maximum number of work threads.asyncio|threaded The async mode to use. Either
"}, {"location": "config/yaml/#embeddings", "title": "embeddings", "text": ""}, {"location": "config/yaml/#fields_2", "title": "Fields", "text": "asyncio
or `threaded."}, {"location": "config/yaml/#input", "title": "input", "text": ""}, {"location": "config/yaml/#fields_3", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)batch_size
int - The maximum batch size to use.batch_max_tokens
int - The maximum batch # of tokens.target
required|all|none - Determines which set of embeddings to export.skip
list[str] - Which embeddings to skip. Only useful if target=all to customize the list.vector_store
dict - The vector store to use. Configured for lancedb by default.
type
str -lancedb
orazure_ai_search
. Default=lancedb
db_uri
str (only for lancedb) - The database uri. Default=storage.base_dir/lancedb
url
str (only for AI Search) - AI Search endpointapi_key
str (optional - only for AI Search) - The AI Search api key to use.audience
str (only for AI Search) - Audience for managed identity token if managed identity authentication is used.overwrite
bool (only used at index creation time) - Overwrite collection if it exist. Default=True
container_name
str - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=default
strategy
dict - Fully override the text-embedding strategy."}, {"location": "config/yaml/#chunks", "title": "chunks", "text": ""}, {"location": "config/yaml/#fields_4", "title": "Fields", "text": "
type
file|blob - The input type to use. Default=file
file_type
text|csv - The type of input data to load. Eithertext
orcsv
. Default istext
base_dir
str - The base directory to read input from, relative to the root.connection_string
str - (blob only) The Azure Storage connection string.storage_account_blob_url
str - The storage account blob URL to use.container_name
str - (blob only) The Azure Storage container name.file_encoding
str - The encoding of the input file. Default isutf-8
file_pattern
str - A regex to match input files. Default is.*\\.csv$
if in csv mode and.*\\.txt$
if in text mode.file_filter
dict - Key/value pairs to filter. Default is None.source_column
str - (CSV Mode Only) The source column name.timestamp_column
str - (CSV Mode Only) The timestamp column name.timestamp_format
str - (CSV Mode Only) The source format.text_column
str - (CSV Mode Only) The text column name.title_column
str - (CSV Mode Only) The title column name.document_attribute_columns
list[str] - (CSV Mode Only) The additional document attributes to include."}, {"location": "config/yaml/#cache", "title": "cache", "text": ""}, {"location": "config/yaml/#fields_5", "title": "Fields", "text": "
size
int - The max chunk size in tokens.overlap
int - The chunk overlap in tokens.group_by_columns
list[str] - group documents by fields before chunking.encoding_model
str - The text encoding model to use. Default is to use the top-level encoding model.strategy
dict - Fully override the chunking strategy."}, {"location": "config/yaml/#storage", "title": "storage", "text": ""}, {"location": "config/yaml/#fields_6", "title": "Fields", "text": "
type
file|memory|none|blob - The cache type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write cache to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#update_index_storage", "title": "update_index_storage", "text": ""}, {"location": "config/yaml/#fields_7", "title": "Fields", "text": "
type
file|memory|blob - The storage type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write output artifacts to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#reporting", "title": "reporting", "text": ""}, {"location": "config/yaml/#fields_8", "title": "Fields", "text": "
type
file|memory|blob - The storage type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write output artifacts to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#entity_extraction", "title": "entity_extraction", "text": ""}, {"location": "config/yaml/#fields_9", "title": "Fields", "text": "
type
file|console|blob - The reporting type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write reports to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#summarize_descriptions", "title": "summarize_descriptions", "text": ""}, {"location": "config/yaml/#fields_10", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.entity_types
list[str] - The entity types to identify.max_gleanings
int - The maximum number of gleaning cycles to use.encoding_model
str - The text encoding model to use. By default, this will use the top-level encoding model.strategy
dict - Fully override the entity extraction strategy."}, {"location": "config/yaml/#claim_extraction", "title": "claim_extraction", "text": ""}, {"location": "config/yaml/#fields_11", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.max_length
int - The maximum number of output tokens per summarization.strategy
dict - Fully override the summarize description strategy."}, {"location": "config/yaml/#community_reports", "title": "community_reports", "text": ""}, {"location": "config/yaml/#fields_12", "title": "Fields", "text": "
enabled
bool - Whether to enable claim extraction. Off by default, because claim prompts really need user tuning.llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.description
str - Describes the types of claims we want to extract.max_gleanings
int - The maximum number of gleaning cycles to use.encoding_model
str - The text encoding model to use. By default, this will use the top-level encoding model.strategy
dict - Fully override the claim extraction strategy."}, {"location": "config/yaml/#cluster_graph", "title": "cluster_graph", "text": ""}, {"location": "config/yaml/#fields_13", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.max_length
int - The maximum number of output tokens per report.max_input_length
int - The maximum number of input tokens to use when generating reports.strategy
dict - Fully override the community reports strategy."}, {"location": "config/yaml/#embed_graph", "title": "embed_graph", "text": ""}, {"location": "config/yaml/#fields_14", "title": "Fields", "text": "
max_cluster_size
int - The maximum cluster size to export.strategy
dict - Fully override the cluster_graph strategy."}, {"location": "config/yaml/#umap", "title": "umap", "text": ""}, {"location": "config/yaml/#fields_15", "title": "Fields", "text": "
enabled
bool - Whether to enable graph embeddings.num_walks
int - The node2vec number of walks.walk_length
int - The node2vec walk length.window_size
int - The node2vec window size.iterations
int - The node2vec number of iterations.random_seed
int - The node2vec random seed.strategy
dict - Fully override the embed graph strategy."}, {"location": "config/yaml/#snapshots", "title": "snapshots", "text": ""}, {"location": "config/yaml/#fields_16", "title": "Fields", "text": "
enabled
bool - Whether to enable UMAP layouts."}, {"location": "config/yaml/#encoding_model", "title": "encoding_model", "text": "
embeddings
bool - Export embeddings snapshots to parquet.graphml
bool - Export graph snapshots to GraphML.transient
bool - Export transient workflow tables snapshots to parquet.str - The text encoding model to use. Default=
"}, {"location": "config/yaml/#skip_workflows", "title": "skip_workflows", "text": "cl100k_base
.list[str] - Which workflow names to skip.
"}, {"location": "config/yaml/#query", "title": "Query", "text": ""}, {"location": "config/yaml/#local_search", "title": "local_search", "text": ""}, {"location": "config/yaml/#fields_17", "title": "Fields", "text": ""}, {"location": "config/yaml/#global_search", "title": "global_search", "text": ""}, {"location": "config/yaml/#fields_18", "title": "Fields", "text": "
prompt
str - The prompt file to use.text_unit_prop
float - The text unit proportion.community_prop
float - The community proportion.conversation_history_max_turns
int - The conversation history maximum turns.top_k_entities
int - The top k mapped entities.top_k_relationships
int - The top k mapped relations.temperature
float | None - The temperature to use for token generation.top_p
float | None - The top-p value to use for token generation.n
int | None - The number of completions to generate.max_tokens
int - The maximum tokens.llm_max_tokens
int - The LLM maximum tokens."}, {"location": "config/yaml/#drift_search", "title": "drift_search", "text": ""}, {"location": "config/yaml/#fields_19", "title": "Fields", "text": "
map_prompt
str - The mapper prompt file to use.reduce_prompt
str - The reducer prompt file to use.knowledge_prompt
str - The knowledge prompt file to use.map_prompt
str | None - The global search mapper prompt to use.reduce_prompt
str | None - The global search reducer to use.knowledge_prompt
str | None - The global search general prompt to use.temperature
float | None - The temperature to use for token generation.top_p
float | None - The top-p value to use for token generation.n
int | None - The number of completions to generate.max_tokens
int - The maximum context size in tokens.data_max_tokens
int - The data llm maximum tokens.map_max_tokens
int - The map llm maximum tokens.reduce_max_tokens
int - The reduce llm maximum tokens.concurrency
int - The number of concurrent requests.dynamic_search_llm
str - LLM model to use for dynamic community selection.dynamic_search_threshold
int - Rating threshold to include a community report.dynamic_search_keep_parent
bool - Keep parent community if any of the child communities are relevant.dynamic_search_num_repeats
int - Number of times to rate the same community report.dynamic_search_use_summary
bool - Use community summary instead of full_context.dynamic_search_concurrent_coroutines
int - Number of concurrent coroutines to rate community reports.dynamic_search_max_level
int - The maximum level of community hierarchy to consider if none of the processed communities are relevant."}, {"location": "data/operation_dulce/ABOUT/", "title": "About", "text": "
prompt
str - The prompt file to use.temperature
float - The temperature to use for token generation.top_p
float - The top-p value to use for token generation.n
int - The number of completions to generate.max_tokens
int - The maximum context size in tokens.data_max_tokens
int - The data llm maximum tokens.concurrency
int - The number of concurrent requests.drift_k_followups
int - The number of top global results to retrieve.primer_folds
int - The number of folds for search priming.primer_llm_max_tokens
int - The maximum number of tokens for the LLM in primer.n_depth
int - The number of drift search steps to take.local_search_text_unit_prop
float - The proportion of search dedicated to text units.local_search_community_prop
float - The proportion of search dedicated to community properties.local_search_top_k_mapped_entities
int - The number of top K entities to map during local search.local_search_top_k_relationships
int - The number of top K relationships to map during local search.local_search_max_data_tokens
int - The maximum context size in tokens for local search.local_search_temperature
float - The temperature to use for token generation in local search.local_search_top_p
float - The top-p value to use for token generation in local search.local_search_n
int - The number of completions to generate in local search.local_search_llm_max_gen_tokens
int - The maximum number of generated tokens for the LLM in local search.This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
"}, {"location": "index/architecture/", "title": "Indexing Architecture", "text": ""}, {"location": "index/architecture/#key-concepts", "title": "Key Concepts", "text": ""}, {"location": "index/architecture/#knowledge-model", "title": "Knowledge Model", "text": "In order to support the GraphRAG system, the outputs of the indexing engine (in the Default Configuration Mode) are aligned to a knowledge model we call the GraphRAG Knowledge Model. This model is designed to be an abstraction over the underlying data storage technology, and to provide a common interface for the GraphRAG system to interact with. In normal use-cases the outputs of the GraphRAG Indexer would be loaded into a database system, and the GraphRAG's Query Engine would interact with the database using the knowledge model data-store types.
"}, {"location": "index/architecture/#datashaper-workflows", "title": "DataShaper Workflows", "text": "GraphRAG's Indexing Pipeline is built on top of our open-source library, DataShaper. DataShaper is a data processing library that allows users to declaratively express data pipelines, schemas, and related assets using well-defined schemas. DataShaper has implementations in JavaScript and Python, and is designed to be extensible to other languages.
One of the core resource types within DataShaper is a Workflow. Workflows are expressed as sequences of steps, which we call verbs. Each step has a verb name and a configuration object. In DataShaper, these verbs model relational concepts such as SELECT, DROP, JOIN, etc. Each verb transforms an input data table, and that table is passed down the pipeline.
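To make the verb idea concrete, here is a minimal, hypothetical sketch of a workflow expressed as a sequence of verb steps; the verb names and argument keys are illustrative stand-ins rather than the exact DataShaper schema.

```python
# Hypothetical sketch of a workflow as a sequence of verb steps.
# The verb names and argument keys are illustrative, not the exact
# DataShaper schema described above.
sample_workflow = {
    "name": "sample_workflow",
    "steps": [
        {"verb": "select", "args": {"columns": ["id", "text"]}},
        {"verb": "join", "args": {"other": "metadata", "on": ["id"]}},
        {"verb": "binarize", "args": {"column": "score", "threshold": 0.5, "to": "label"}},
    ],
}

# Each step names a verb plus its configuration; the output table of one
# step becomes the input table of the next.
for step in sample_workflow["steps"]:
    print(step["verb"], step["args"])
```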
"}, {"location": "index/architecture/#llm-based-workflow-steps", "title": "LLM-based Workflow Steps", "text": "---\ntitle: Sample Workflow\n---\nflowchart LR\n input[Input Table] --> select[SELECT] --> join[JOIN] --> binarize[BINARIZE] --> output[Output Table]
GraphRAG's Indexing Pipeline implements a handful of custom verbs on top of the standard, relational verbs that our DataShaper library provides. These verbs give us the ability to augment text documents with rich, structured data using the power of LLMs such as GPT-4. We utilize these verbs in our standard workflow to extract entities, relationships, claims, community structures, and community reports and summaries. This behavior is customizable and can be extended to support many kinds of AI-based data enrichment and extraction tasks.
"}, {"location": "index/architecture/#workflow-graphs", "title": "Workflow Graphs", "text": "Because of the complexity of our data indexing tasks, we needed to be able to express our data pipeline as series of multiple, interdependent workflows. In the GraphRAG Indexing Pipeline, each workflow may define dependencies on other workflows, effectively forming a directed acyclic graph (DAG) of workflows, which is then used to schedule processing.
"}, {"location": "index/architecture/#dataframe-message-format", "title": "Dataframe Message Format", "text": "---\ntitle: Sample Workflow DAG\n---\nstateDiagram-v2\n [*] --> Prepare\n Prepare --> Chunk\n Chunk --> ExtractGraph\n Chunk --> EmbedDocuments\n ExtractGraph --> GenerateReports\n ExtractGraph --> EmbedEntities\n ExtractGraph --> EmbedGraph
The primary unit of communication between workflows, and between workflow steps, is an instance of
"}, {"location": "index/architecture/#llm-caching", "title": "LLM Caching", "text": "pandas.DataFrame
. Although side-effects are possible, our goal is to be data-centric and table-centric in our approach to data processing. This allows us to easily reason about our data, and to leverage the power of dataframe-based ecosystems. Our underlying dataframe technology may change over time, but our primary goal is to support the DataShaper workflow schema while retaining single-machine ease of use and developer ergonomics.The GraphRAG library was designed with LLM interactions in mind, and a common setback when working with LLM APIs is various errors due to network latency, throttling, etc.. Because of these potential error cases, we've added a cache layer around LLM interactions. When completion requests are made using the same input set (prompt and tuning parameters), we return a cached result if one exists. This allows our indexer to be more resilient to network issues, to act idempotently, and to provide a more efficient end-user experience.
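As a rough illustration of the caching idea (not GraphRAG's internal cache format), a cache key can be derived deterministically from the prompt and tuning parameters, so a repeated request returns the stored completion:

```python
import hashlib
import json

# Illustrative sketch only: derive a deterministic cache key from the input
# set (prompt plus tuning parameters), then reuse any stored completion.
def cache_key(prompt: str, parameters: dict) -> str:
    payload = json.dumps({"prompt": prompt, "parameters": parameters}, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

cache: dict[str, str] = {}

def cached_completion(prompt: str, parameters: dict, call_llm) -> str:
    key = cache_key(prompt, parameters)
    if key not in cache:
        # Only hit the LLM when this exact input set has not been seen before.
        cache[key] = call_llm(prompt, **parameters)
    return cache[key]
```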
"}, {"location": "index/default_dataflow/", "title": "Indexing Dataflow", "text": ""}, {"location": "index/default_dataflow/#the-graphrag-knowledge-model", "title": "The GraphRAG Knowledge Model", "text": "The knowledge model is a specification for data outputs that conform to our data-model definition. You can find these definitions in the python/graphrag/graphrag/model folder within the GraphRAG repository. The following entity types are provided. The fields here represent the fields that are text-embedded by default.
"}, {"location": "index/default_dataflow/#the-default-configuration-workflow", "title": "The Default Configuration Workflow", "text": "
Document
- An input document into the system. These represent either individual rows in a CSV or individual .txt files.TextUnit
- A chunk of text to analyze. The size of these chunks, their overlap, and whether they adhere to any data boundaries may be configured below. A common use case is to setCHUNK_BY_COLUMNS
toid
so that there is a 1-to-many relationship between documents and TextUnits instead of a many-to-many.Entity
- An entity extracted from a TextUnit. These represent people, places, events, or some other entity-model that you provide.Relationship
- A relationship between two entities. These are extracted from the TextUnits during graph extraction.Covariate
- Extracted claim information, which contains statements about entities which may be time-bound.Community
- Once the graph of entities and relationships is built, we perform hierarchical community detection on them to create a clustering structure.Community Report
- The contents of each community are summarized into a generated report, useful for human reading and downstream search.Node
- This table contains layout information for rendered graph-views of the Entities and Documents which have been embedded and clustered.Let's take a look at how the default-configuration workflow transforms text documents into the GraphRAG Knowledge Model. This page gives a general overview of the major steps in this process. To fully configure this workflow, check out the configuration documentation.
"}, {"location": "index/default_dataflow/#phase-1-compose-textunits", "title": "Phase 1: Compose TextUnits", "text": "---\ntitle: Dataflow Overview\n---\nflowchart TB\n subgraph phase1[Phase 1: Compose TextUnits]\n documents[Documents] --> chunk[Chunk]\n chunk --> embed[Embed] --> textUnits[Text Units]\n end\n subgraph phase2[Phase 2: Graph Extraction]\n textUnits --> graph_extract[Entity & Relationship Extraction]\n graph_extract --> graph_summarize[Entity & Relationship Summarization]\n graph_summarize --> claim_extraction[Claim Extraction]\n claim_extraction --> graph_outputs[Graph Tables]\n end\n subgraph phase3[Phase 3: Graph Augmentation]\n graph_outputs --> community_detect[Community Detection]\n community_detect --> graph_embed[Graph Embedding]\n graph_embed --> augmented_graph[Augmented Graph Tables]\n end\n subgraph phase4[Phase 4: Community Summarization]\n augmented_graph --> summarized_communities[Community Summarization]\n summarized_communities --> embed_communities[Community Embedding]\n embed_communities --> community_outputs[Community Tables]\n end\n subgraph phase5[Phase 5: Document Processing]\n documents --> link_to_text_units[Link to TextUnits]\n textUnits --> link_to_text_units\n link_to_text_units --> embed_documents[Document Embedding]\n embed_documents --> document_graph[Document Graph Creation]\n document_graph --> document_outputs[Document Tables]\n end\n subgraph phase6[Phase 6: Network Visualization]\n document_outputs --> umap_docs[Umap Documents]\n augmented_graph --> umap_entities[Umap Entities]\n umap_docs --> combine_nodes[Nodes Table]\n umap_entities --> combine_nodes\n end
The first phase of the default-configuration workflow is to transform input documents into TextUnits. A TextUnit is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to provide breadcrumbs and provenance, linking concepts back to their original source text.
The chunk size (counted in tokens) is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single \"glean\" step. (A \"glean\" step is a follow-on extraction.) Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log)
Each of these text-units is text-embedded and passed into the next phase of the pipeline.
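A minimal sketch of token-based chunking with overlap, assuming the tiktoken package and the cl100k_base encoding; the defaults mirror the 300-token chunks mentioned above, but this is not the pipeline's actual implementation:

```python
import tiktoken

# Illustrative sketch: split text into overlapping windows of tokens and
# decode each window back into a chunk string.
def chunk_text(text: str, chunk_size: int = 300, overlap: int = 100) -> list[str]:
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    step = max(1, chunk_size - overlap)
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start : start + chunk_size]
        chunks.append(encoding.decode(window))
        if start + chunk_size >= len(tokens):
            break
    return chunks
```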
"}, {"location": "index/default_dataflow/#phase-2-graph-extraction", "title": "Phase 2: Graph Extraction", "text": "---\ntitle: Documents into Text Chunks\n---\nflowchart LR\n doc1[Document 1] --> tu1[TextUnit 1]\n doc1 --> tu2[TextUnit 2]\n doc2[Document 2] --> tu3[TextUnit 3]\n doc2 --> tu4[TextUnit 4]\n
In this phase, we analyze each text unit and extract our graph primitives: Entities, Relationships, and Claims. Entities and Relationships are extracted at once in our entity_extract verb, and claims are extracted in our claim_extract verb. Results are then combined and passed into following phases of the pipeline.
"}, {"location": "index/default_dataflow/#entity-relationship-extraction", "title": "Entity & Relationship Extraction", "text": "---\ntitle: Graph Extraction\n---\nflowchart LR\n tu[TextUnit] --> ge[Graph Extraction] --> gs[Graph Summarization]\n tu --> ce[Claim Extraction]
In this first step of graph extraction, we process each text-unit in order to extract entities and relationships out of the raw text using the LLM. The output of this step is a subgraph-per-TextUnit containing a list of entities with a name, type, and description, and a list of relationships with a source, target, and description.
These subgraphs are merged together - any entities with the same name and type are merged by creating an array of their descriptions. Similarly, any relationships with the same source and target are merged by creating an array of their descriptions.
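A simplified sketch of that merge step: entities sharing a (name, type) key and relationships sharing a (source, target) key accumulate their descriptions into lists for later summarization. The subgraph dictionary layout here is assumed for illustration.

```python
from collections import defaultdict

# Illustrative merge: collect descriptions per (name, type) entity key and
# per (source, target) relationship key across all per-TextUnit subgraphs.
def merge_subgraphs(subgraphs: list[dict]) -> tuple[dict, dict]:
    entities: dict = defaultdict(list)
    relationships: dict = defaultdict(list)
    for subgraph in subgraphs:
        for entity in subgraph["entities"]:
            entities[(entity["name"], entity["type"])].append(entity["description"])
        for rel in subgraph["relationships"]:
            relationships[(rel["source"], rel["target"])].append(rel["description"])
    return dict(entities), dict(relationships)
```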
"}, {"location": "index/default_dataflow/#entity-relationship-summarization", "title": "Entity & Relationship Summarization", "text": "Now that we have a graph of entities and relationships, each with a list of descriptions, we can summarize these lists into a single description per entity and relationship. This is done by asking the LLM for a short summary that captures all of the distinct information from each description. This allows all of our entities and relationships to have a single concise description.
"}, {"location": "index/default_dataflow/#claim-extraction-emission", "title": "Claim Extraction & Emission", "text": "Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These get exported as a primary artifact called Covariates.
Note: claim extraction is optional and turned off by default. This is because claim extraction generally requires prompt tuning to be useful.
"}, {"location": "index/default_dataflow/#phase-3-graph-augmentation", "title": "Phase 3: Graph Augmentation", "text": "Now that we have a usable graph of entities and relationships, we want to understand their community structure and augment the graph with additional information. This is done in two steps: Community Detection and Graph Embedding. These give us explicit (communities) and implicit (embeddings) ways of understanding the topological structure of our graph.
"}, {"location": "index/default_dataflow/#community-detection", "title": "Community Detection", "text": "---\ntitle: Graph Augmentation\n---\nflowchart LR\n cd[Leiden Hierarchical Community Detection] --> ge[Node2Vec Graph Embedding] --> ag[Graph Table Emission]
In this step, we generate a hierarchy of entity communities using the Hierarchical Leiden Algorithm. This method will apply a recursive community-clustering to our graph until we reach a community-size threshold. This will allow us to understand the community structure of our graph and provide a way to navigate and summarize the graph at different levels of granularity.
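As a rough, runnable stand-in for the clustering idea (the pipeline itself uses Hierarchical Leiden, not Louvain), this sketch produces a node-to-community assignment with networkx:

```python
import networkx as nx

# Stand-in sketch: the real pipeline applies Hierarchical Leiden recursively
# until a community-size threshold is met; Louvain is used here only to show
# the shape of the output (a node -> community assignment at one level).
graph = nx.karate_club_graph()
communities = nx.community.louvain_communities(graph, seed=42)
node_to_community = {
    node: idx for idx, members in enumerate(communities) for node in members
}
print(node_to_community)
```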
"}, {"location": "index/default_dataflow/#graph-embedding", "title": "Graph Embedding", "text": "In this step, we generate a vector representation of our graph using the Node2Vec algorithm. This will allow us to understand the implicit structure of our graph and provide an additional vector-space in which to search for related concepts during our query phase.
"}, {"location": "index/default_dataflow/#graph-tables-emission", "title": "Graph Tables Emission", "text": "Once our graph augmentation steps are complete, the final Entities and Relationships tables are exported after their text fields are text-embedded.
"}, {"location": "index/default_dataflow/#phase-4-community-summarization", "title": "Phase 4: Community Summarization", "text": "---\ntitle: Community Summarization\n---\nflowchart LR\n sc[Generate Community Reports] --> ss[Summarize Community Reports] --> ce[Community Embedding] --> co[Community Tables Emission]
At this point, we have a functional graph of entities and relationships, a hierarchy of communities for the entities, as well as node2vec embeddings.
Now we want to build on that community data and generate reports for each community. This gives us a high-level understanding of the graph at several levels of granularity. For example, if community A is the top-level community, we'll get a report about the entire graph. If the community is lower-level, we'll get a report about a local cluster.
"}, {"location": "index/default_dataflow/#generate-community-reports", "title": "Generate Community Reports", "text": "In this step, we generate a summary of each community using the LLM. This will allow us to understand the distinct information contained within each community and provide a scoped understanding of the graph, from either a high-level or a low-level perspective. These reports contain an executive overview and reference the key entities, relationships, and claims within the community sub-structure.
"}, {"location": "index/default_dataflow/#summarize-community-reports", "title": "Summarize Community Reports", "text": "In this step, each community report is then summarized via the LLM for shorthand use.
"}, {"location": "index/default_dataflow/#community-embedding", "title": "Community Embedding", "text": "In this step, we generate a vector representation of our communities by generating text embeddings of the community report, the community report summary, and the title of the community report.
"}, {"location": "index/default_dataflow/#community-tables-emission", "title": "Community Tables Emission", "text": "At this point, some bookkeeping work is performed and we export the Communities and CommunityReports tables.
"}, {"location": "index/default_dataflow/#phase-5-document-processing", "title": "Phase 5: Document Processing", "text": "In this phase of the workflow, we create the Documents table for the knowledge model.
"}, {"location": "index/default_dataflow/#augment-with-columns-csv-only", "title": "Augment with Columns (CSV Only)", "text": "---\ntitle: Document Processing\n---\nflowchart LR\n aug[Augment] --> dp[Link to TextUnits] --> de[Avg. Embedding] --> dg[Document Table Emission]
If the workflow is operating on CSV data, you may configure your workflow to add additional fields to Documents output. These fields should exist on the incoming CSV tables. Details about configuring this can be found in the configuration documentation.
"}, {"location": "index/default_dataflow/#link-to-textunits", "title": "Link to TextUnits", "text": "In this step, we link each document to the text-units that were created in the first phase. This allows us to understand which documents are related to which text-units and vice-versa.
"}, {"location": "index/default_dataflow/#document-embedding", "title": "Document Embedding", "text": "In this step, we generate a vector representation of our documents using an average embedding of document slices. We re-chunk documents without overlapping chunks, and then generate an embedding for each chunk. We create an average of these chunks weighted by token-count and use this as the document embedding. This will allow us to understand the implicit relationship between documents, and will help us generate a network representation of our documents.
"}, {"location": "index/default_dataflow/#documents-table-emission", "title": "Documents Table Emission", "text": "At this point, we can export the Documents table into the knowledge Model.
"}, {"location": "index/default_dataflow/#phase-6-network-visualization", "title": "Phase 6: Network Visualization", "text": "In this phase of the workflow, we perform some steps to support network visualization of our high-dimensional vector spaces within our existing graphs. At this point there are two logical graphs at play: the Entity-Relationship graph and the Document graph.
---\ntitle: Network Visualization Workflows\n---\nflowchart LR\n nv[Umap Documents] --> ne[Umap Entities] --> ng[Nodes Table Emission]
For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then exported as a table of Nodes. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates.
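A hedged sketch of the 2D layout step, assuming the umap-learn package; the input vectors here are random stand-ins for the real entity/document embeddings:

```python
import numpy as np
import umap

# Illustrative only: reduce high-dimensional node embeddings to x/y
# coordinates suitable for a 2D graph layout.
embeddings = np.random.default_rng(0).normal(size=(100, 128))  # stand-in vectors
coordinates = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)
print(coordinates.shape)  # (100, 2): one (x, y) pair per node
```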
"}, {"location": "index/outputs/", "title": "Outputs", "text": "The default pipeline produces a series of output tables that align with the conceptual knowledge model. This page describes the detailed output table schemas. By default we write these tables out as parquet files on disk.
"}, {"location": "index/outputs/#shared-fields", "title": "Shared fields", "text": "All tables have two identifier fields:
name type description id str Generated UUID, assuring global uniqueness human_readable_id int This is an incremented short ID created per-run. For example, we use this short ID with generated summaries that print citations so they are easy to cross-reference visually."}, {"location": "index/outputs/#create_final_communities", "title": "create_final_communities", "text": "This is a list of the final communities generated by Leiden. Communities are strictly hierarchical, subdividing into children as the cluster affinity is narrowed.
name type description community int Leiden-generated cluster ID for the community. Note that these increment with depth, so they are unique through all levels of the community hierarchy. For this table, human_readable_id is a copy of the community ID rather than a plain increment. parent int Parent community ID. level int Depth of the community in the hierarchy. title str Friendly name of the community. entity_ids str[] List of entities that are members of the community. relationship_ids str[] List of relationships that are wholly within the community (source and target are both in the community). text_unit_ids str[] List of text units represented within the community. period str Date of ingest, used for incremental update merges. ISO8601 size int Size of the community (entity count), used for incremental update merges."}, {"location": "index/outputs/#create_final_community_reports", "title": "create_final_community_reports", "text": "This is the list of summarized reports for each community.
name type description community int Short ID of the community this report applies to. parent int Parent community ID. level int Level of the community this report applies to. title str LM-generated title for the report. summary str LM-generated summary of the report. full_content str LM-generated full report. rank float LM-derived relevance ranking of the report based on member entity salience rank_explanation str LM-derived explanation of the rank. findings dict LM-derived list of the top 5-10 insights from the community. Containssummary
andexplanation
values. full_content_json json Full JSON output as returned by the LM. Most fields are extracted into columns, but this JSON is sent for query summarization so we leave it to allow for prompt tuning to add fields/content by end users. period str Date of ingest, used for incremental update merges. ISO8601 size int Size of the community (entity count), used for incremental update merges."}, {"location": "index/outputs/#create_final_covariates", "title": "create_final_covariates", "text": "(Optional) If claim extraction is turned on, this is a list of the extracted covariates. Note that claims are typically oriented around identifying malicious behavior such as fraud, so they are not useful for all datasets.
name type description covariate_type str This is always \"claim\" with our default covariates. type str Nature of the claim type. description str LM-generated description of the behavior. subject_id str Name of the source entity (that is performing the claimed behavior). object_id str Name of the target entity (that the claimed behavior is performed on). status str LM-derived assessment of the correctness of the claim. One of [TRUE, FALSE, SUSPECTED] start_date str LM-derived start of the claimed activity. ISO8601 end_date str LM-derived end of the claimed activity. ISO8601 source_text str Short string of text containing the claimed behavior. text_unit_id str ID of the text unit the claim text was extracted from."}, {"location": "index/outputs/#create_final_documents", "title": "create_final_documents", "text": "List of document content after import.
name type description title str Filename, unless otherwise configured during CSV import. text str Full text of the document. text_unit_ids str[] List of text units (chunks) that were parsed from the document. attributes dict (optional) If specified during CSV import, this is a dict of attributes for the document."}, {"location": "index/outputs/#create_final_entities", "title": "create_final_entities", "text": "List of all entities found in the data by the LM.
name type description title str Name of the entity. type str Type of the entity. By default this will be \"organization\", \"person\", \"geo\", or \"event\" unless configured differently or auto-tuning is used. description str Textual description of the entity. Entities may be found in many text units, so this is an LM-derived summary of all descriptions. text_unit_ids str[] List of the text units containing the entity."}, {"location": "index/outputs/#create_final_nodes", "title": "create_final_nodes", "text": "This is graph-related information for the entities. It contains only information relevant to the graph such as community. There is an entry for each entity at every community level it is found within, so you may see \"duplicate\" entities.
Note that the ID fields match those in create_final_entities and can be used for joining if additional information about a node is required.
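For example, a join on the shared id field might look like the following sketch, assuming the default parquet output location on disk:

```python
import pandas as pd

# Illustrative join: enrich the graph layout rows with entity details using
# the shared id field. The file paths assume the default output directory.
nodes = pd.read_parquet("output/create_final_nodes.parquet")
entities = pd.read_parquet("output/create_final_entities.parquet")
nodes_with_details = nodes.merge(
    entities[["id", "type", "description"]], on="id", how="left"
)
print(nodes_with_details.head())
```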
name type description title str Name of the referenced entity. Duplicated from create_final_entities for convenient cross-referencing. community int Leiden community the node is found within. Entities are not always assigned a community (they may not be close enough to any), so they may have a ID of -1. level int Level of the community the entity is in. degree int Node degree (connectedness) in the graph. x float X position of the node for visual layouts. If graph embeddings and UMAP are not turned on, this will be 0. y float Y position of the node for visual layouts. If graph embeddings and UMAP are not turned on, this will be 0."}, {"location": "index/outputs/#create_final_relationships", "title": "create_final_relationships", "text": "List of all entity-to-entity relationships found in the data by the LM. This is also the edge list for the graph.
name type description source str Name of the source entity. target str Name of the target entity. description str LM-derived description of the relationship. Also see note for entity descriptions. weight float Weight of the edge in the graph. This is summed from an LM-derived \"strength\" measure for each relationship instance. combined_degree int Sum of source and target node degrees. text_unit_ids str[] List of text units the relationship was found within."}, {"location": "index/outputs/#create_final_text_units", "title": "create_final_text_units", "text": "List of all text chunks parsed from the input documents.
name type description text str Raw full text of the chunk. n_tokens int Number of tokens in the chunk. This should normally match thechunk_size
config parameter, except for the last chunk which is often shorter. document_ids str[] List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. entity_ids str[] List of entities found in the text unit. relationships_ids str[] List of relationships found in the text unit. covariate_ids str[] Optional list of covariates found in the text unit."}, {"location": "index/overview/", "title": "GraphRAG Indexing \ud83e\udd16", "text": "The GraphRAG indexing package is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using LLMs.
Indexing Pipelines are configurable. They are composed of workflows, standard and custom steps, prompt templates, and input/output adapters. Our standard pipeline is designed to:
- extract entities, relationships and claims from raw text
- perform community detection in entities
- generate community summaries and reports at multiple levels of granularity
- embed entities into a graph vector space
- embed text chunks into a textual vector space
The outputs of the pipeline can be stored in a variety of formats, including JSON and Parquet - or they can be handled manually via the Python API.
"}, {"location": "index/overview/#getting-started", "title": "Getting Started", "text": ""}, {"location": "index/overview/#requirements", "title": "Requirements", "text": "See the requirements section in Get Started for details on setting up a development environment.
The Indexing Engine can be used in either a default configuration mode or with a custom pipeline. To configure GraphRAG, see the configuration documentation. After you have a config file you can run the pipeline using the CLI or the Python API.
"}, {"location": "index/overview/#usage", "title": "Usage", "text": ""}, {"location": "index/overview/#cli", "title": "CLI", "text": ""}, {"location": "index/overview/#python-api", "title": "Python API", "text": "# Via Poetry\npoetry run poe cli --root <data_root> # default config mode\npoetry run poe cli --config your_pipeline.yml # custom config mode\n\n# Via Node\nyarn run:index --root <data_root> # default config mode\nyarn run:index --config your_pipeline.yml # custom config mode\n
Please see the examples folder for a handful of functional pipelines illustrating how to create and run via a custom settings.yml or through custom python scripts.
"}, {"location": "index/overview/#further-reading", "title": "Further Reading", "text": ""}, {"location": "prompt_tuning/auto_prompt_tuning/", "title": "Auto Prompt Tuning \u2699\ufe0f", "text": "
- To start developing within the GraphRAG project, see getting started
- To understand the underlying concepts and execution model of the indexing library, see the architecture documentation
- To get running with a series of examples, see the examples documentation
- To read more about configuring the indexing engine, see the configuration documentation
GraphRAG provides the ability to create domain adapted prompts for the generation of the knowledge graph. This step is optional, though it is highly encouraged to run it as it will yield better results when executing an Index Run.
These are generated by loading the inputs, splitting them into chunks (text units), and then running a series of LLM invocations and template substitutions to generate the final prompts. We suggest using the default values provided by the script, but on this page you'll find the details of each in case you want to further explore and tweak the prompt tuning algorithm.
Figure 1: Auto Tuning Conceptual Diagram.
"}, {"location": "prompt_tuning/auto_prompt_tuning/#prerequisites", "title": "Prerequisites", "text": "Before running auto tuning, ensure you have already initialized your workspace with the
"}, {"location": "prompt_tuning/auto_prompt_tuning/#usage", "title": "Usage", "text": "graphrag init
command. This will create the necessary configuration files and the default prompts. Refer to the Init Documentation for more information about the initialization process.You can run the main script from the command line with various options:
"}, {"location": "prompt_tuning/auto_prompt_tuning/#command-line-options", "title": "Command-Line Options", "text": "graphrag prompt-tune [--root ROOT] [--config CONFIG] [--domain DOMAIN] [--selection-method METHOD] [--limit LIMIT] [--language LANGUAGE] \\\n[--max-tokens MAX_TOKENS] [--chunk-size CHUNK_SIZE] [--n-subset-max N_SUBSET_MAX] [--k K] \\\n[--min-examples-required MIN_EXAMPLES_REQUIRED] [--discover-entity-types] [--output OUTPUT]\n
"}, {"location": "prompt_tuning/auto_prompt_tuning/#example-usage", "title": "Example Usage", "text": "
--config
(required): The path to the configuration file. This is required to load the data and model settings.
--root
(optional): The data project root directory, including the config files (YML, JSON, or .env). Defaults to the current directory.
--domain
(optional): The domain related to your input data, such as 'space science', 'microbiology', or 'environmental news'. If left empty, the domain will be inferred from the input data.
--method
(optional): The method to select documents. Options are all, random, auto or top. Default is random.
--limit
(optional): The limit of text units to load when using random or top selection. Default is 15.
--language
(optional): The language to use for input processing. If it is different from the inputs' language, the LLM will translate. Default is \"\" meaning it will be automatically detected from the inputs.
--max-tokens
(optional): Maximum token count for prompt generation. Default is 2000.
--chunk-size
(optional): The size in tokens to use for generating text units from input documents. Default is 200.
--n-subset-max
(optional): The number of text chunks to embed when using auto selection method. Default is 300.
--k
(optional): The number of documents to select when using auto selection method. Default is 15.
--min-examples-required
(optional): The minimum number of examples required for entity extraction prompts. Default is 2.
--discover-entity-types
(optional): Allow the LLM to discover and extract entities automatically. We recommend using this when your data covers a lot of topics or it is highly randomized.
--output
(optional): The folder to save the generated prompts. Default is \"prompts\".python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --domain \"environmental news\" \\\n--method random --limit 10 --language English --max-tokens 2048 --chunk-size 256 --min-examples-required 3 \\\n--no-entity-types --output /path/to/output\n
or, with minimal configuration (suggested):
"}, {"location": "prompt_tuning/auto_prompt_tuning/#document-selection-methods", "title": "Document Selection Methods", "text": "python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --no-entity-types\n
The auto tuning feature ingests the input data and then divides it into text units sized by the chunk size parameter. After that, it uses one of the following selection methods to pick a sample to work with for prompt generation:
"}, {"location": "prompt_tuning/auto_prompt_tuning/#modify-env-vars", "title": "Modify Env Vars", "text": "
random
: Select text units randomly. This is the default and recommended option.top
: Select the head n text units.all
: Use all text units for the generation. Use only with small datasets; this option is not usually recommended.auto
: Embed text units in a lower-dimensional space and select the k nearest neighbors to the centroid. This is useful when you have a large dataset and want to select a representative sample.After running auto tuning, you should modify the following environment variables (or config variables) to pick up the new prompts on your index run. Note: Please make sure to use the correct path to the generated prompts; in this example we are using the default \"prompts\" path.
GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE
= \"prompts/entity_extraction.txt\"
GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE
= \"prompts/community_report.txt\"
GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE
= \"prompts/summarize_descriptions.txt\"or in your yaml config file:
"}, {"location": "prompt_tuning/manual_prompt_tuning/", "title": "Manual Prompt Tuning \u2699\ufe0f", "text": "entity_extraction:\n prompt: \"prompts/entity_extraction.txt\"\n\nsummarize_descriptions:\n prompt: \"prompts/summarize_descriptions.txt\"\n\ncommunity_reports:\n prompt: \"prompts/community_report.txt\"\n
The GraphRAG indexer, by default, will run with a handful of prompts that are designed to work well in the broad context of knowledge discovery. However, it is quite common to want to tune the prompts to better suit your specific use case. We provide a means for you to do this by allowing you to specify custom prompt files, each of which uses a series of token replacements internally.
Each of these prompts may be overridden by writing a custom prompt file in plaintext. We use token-replacements in the form of
"}, {"location": "prompt_tuning/manual_prompt_tuning/#indexing-prompts", "title": "Indexing Prompts", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#entityrelationship-extraction", "title": "Entity/Relationship Extraction", "text": "{token_name}
, and the descriptions for the available tokens can be found below.Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#summarize-entityrelationship-descriptions", "title": "Summarize Entity/Relationship Descriptions", "text": "
- {input_text} - The input text to be processed.
- {entity_types} - A list of entity types
- {tuple_delimiter} - A delimiter for separating values within a tuple. A single tuple is used to represent an individual entity or relationship.
- {record_delimiter} - A delimiter for separating tuple instances.
- {completion_delimiter} - An indicator for when generation is complete.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_1", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#claim-extraction", "title": "Claim Extraction", "text": "
- {entity_name} - The name of the entity or the source/target pair of the relationship.
- {description_list} - A list of descriptions for the entity or relationship.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_2", "title": "Tokens", "text": "
- {input_text} - The input text to be processed.
- {tuple_delimiter} - A delimiter for separating values within a tuple. A single tuple is used to represent an individual entity or relationship.
- {record_delimiter} - A delimiter for separating tuple instances.
- {completion_delimiter} - An indicator for when generation is complete.
- {entity_specs} - A list of entity types.
- {claim_description} - Description of what claims should look like. Default is:
\"Any claims or facts that could be relevant to information discovery.\"
See the configuration documentation for details on how to change this.
"}, {"location": "prompt_tuning/manual_prompt_tuning/#generate-community-reports", "title": "Generate Community Reports", "text": "Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_3", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#query-prompts", "title": "Query Prompts", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#local-search", "title": "Local Search", "text": "
- {input_text} - The input text to generate the report with. This will contain tables of entities and relationships.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_4", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#global-search", "title": "Global Search", "text": "
- {response_type} - Describe how the response should look. We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
Mapper Prompt Source
Reducer Prompt Source
Knowledge Prompt Source
Global search uses a map/reduce approach to summarization. You can tune these prompts independently. This search also includes the ability to adjust the use of general knowledge from the model's training.
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_5", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#drift-search", "title": "Drift Search", "text": "
- {response_type} - Describe how the response should look (reducer only). We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_6", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/overview/", "title": "Prompt Tuning \u2699\ufe0f", "text": "
- {response_type} - Describe how the response should look. We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
- {community_reports} - The most relevant community reports to include in the summarization.
- {query} - The query text as injected into the context.
This page provides an overview of the prompt tuning options available for the GraphRAG indexing engine.
"}, {"location": "prompt_tuning/overview/#default-prompts", "title": "Default Prompts", "text": "The default prompts are the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. More details about each of the default prompts for indexing and query can be found on the manual tuning page.
"}, {"location": "prompt_tuning/overview/#auto-tuning", "title": "Auto Tuning", "text": "Auto Tuning leverages your input data and LLM interactions to create domain adapted prompts for the generation of the knowledge graph. It is highly encouraged to run it as it will yield better results when executing an Index Run. For more details about how to use it, please refer to the Auto Tuning documentation.
"}, {"location": "prompt_tuning/overview/#manual-tuning", "title": "Manual Tuning", "text": "Manual tuning is an advanced use-case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the manual tuning documentation.
"}, {"location": "query/drift_search/", "title": "DRIFT Search \ud83d\udd0e", "text": ""}, {"location": "query/drift_search/#combining-local-and-global-search", "title": "Combining Local and Global Search", "text": "GraphRAG is a technique that uses large language models (LLMs) to create knowledge graphs and summaries from unstructured text documents and leverages them to improve retrieval-augmented generation (RAG) operations on private datasets. It offers comprehensive global overviews of large, private troves of unstructured text documents while also enabling exploration of detailed, localized information. By using LLMs to create comprehensive knowledge graphs that connect and describe entities and relationships contained in those documents, GraphRAG leverages semantic structuring of the data to generate responses to a wide variety of complex user queries.
DRIFT search (Dynamic Reasoning and Inference with Flexible Traversal) builds upon Microsoft\u2019s GraphRAG technique, combining characteristics of both global and local search to generate detailed responses in a method that balances computational costs with quality outcomes.
"}, {"location": "query/drift_search/#methodology", "title": "Methodology", "text": "Figure 1. An entire DRIFT search hierarchy highlighting the three core phases of the DRIFT search process. A (Primer): DRIFT compares the user\u2019s query with the top K most semantically relevant community reports, generating a broad initial answer and follow-up questions to steer further exploration. B (Follow-Up): DRIFT uses local search to refine queries, producing additional intermediate answers and follow-up questions that enhance specificity, guiding the engine towards context-rich information. A glyph on each node in the diagram shows the confidence the algorithm has to continue the query expansion step. C (Output Hierarchy): The final output is a hierarchical structure of questions and answers ranked by relevance, reflecting a balanced mix of global insights and local refinements, making the results adaptable and comprehensive.
DRIFT Search introduces a new approach to local search queries by including community information in the search process. This greatly expands the breadth of the query\u2019s starting point and leads to retrieval and usage of a far higher variety of facts in the final answer. This addition expands the GraphRAG query engine by providing a more comprehensive option for local search, which uses community insights to refine a query into detailed follow-up questions.
"}, {"location": "query/drift_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the DRIFTSearch class:
"}, {"location": "query/drift_search/#how-to-use", "title": "How to Use", "text": "
llm
: OpenAI model object to be used for response generationcontext_builder
: context builder object to be used for preparing context data from community reports and query informationconfig
: model to define the DRIFT Search hyperparameters. DRIFT Config modeltoken_encoder
: token encoder for tracking the budget for the algorithm.query_state
: a state object as defined in Query State that allows tracking the execution of a DRIFT Search instance, alongside follow-ups and DRIFT actions.An example of a drift search scenario can be found in the following notebook.
"}, {"location": "query/drift_search/#learn-more", "title": "Learn More", "text": "For a more in-depth look at the DRIFT search method, please refer to our DRIFT Search blog post
"}, {"location": "query/global_search/", "title": "Global Search \ud83d\udd0e", "text": ""}, {"location": "query/global_search/#whole-dataset-reasoning", "title": "Whole Dataset Reasoning", "text": "Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as \u201cWhat are the top 5 themes in the data?\u201d perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information.
However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole. This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized. Using our global search method, the LLM uses these clusters to summarize these themes when responding to a user query.
"}, {"location": "query/global_search/#methodology", "title": "Methodology", "text": "---\ntitle: Global Search Dataflow\n---\n%%{ init: { 'flowchart': { 'curve': 'step' } } }%%\nflowchart LR\n\n uq[User Query] --- .1\n ch1[Conversation History] --- .1\n\n subgraph RIR\n direction TB\n ri1[Rated Intermediate<br/>Response 1]~~~ri2[Rated Intermediate<br/>Response 2] -.\"{1..N}\".-rin[Rated Intermediate<br/>Response N]\n end\n\n .1--Shuffled Community<br/>Report Batch 1-->RIR\n .1--Shuffled Community<br/>Report Batch 2-->RIR---.2\n .1--Shuffled Community<br/>Report Batch N-->RIR\n\n .2--Ranking +<br/>Filtering-->agr[Aggregated Intermediate<br/>Responses]-->res[Response]\n\n\n\n classDef green fill:#26B653,stroke:#333,stroke-width:2px,color:#fff;\n classDef turquoise fill:#19CCD3,stroke:#333,stroke-width:2px,color:#fff;\n classDef rose fill:#DD8694,stroke:#333,stroke-width:2px,color:#fff;\n classDef orange fill:#F19914,stroke:#333,stroke-width:2px,color:#fff;\n classDef purple fill:#B356CD,stroke:#333,stroke-width:2px,color:#fff;\n classDef invisible fill:#fff,stroke:#fff,stroke-width:0px,color:#fff, width:0px;\n class uq,ch1 turquoise;\n class ri1,ri2,rin rose;\n class agr orange;\n class res purple;\n class .1,.2 invisible;\n
Given a user query and, optionally, the conversation history, the global search method uses a collection of LLM-generated community reports from a specified level of the graph's community hierarchy as context data to generate response in a map-reduce manner. At the
map
step, community reports are segmented into text chunks of pre-defined size. Each text chunk is then used to produce an intermediate response containing a list of points, each accompanied by a numerical rating indicating the importance of the point. At thereduce
step, a filtered set of the most important points from the intermediate responses is aggregated and used as the context to generate the final response.The quality of the global search\u2019s response can be heavily influenced by the level of the community hierarchy chosen for sourcing community reports. Lower hierarchy levels, with their detailed reports, tend to yield more thorough responses, but may also increase the time and LLM resources needed to generate the final response due to the volume of reports.
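To make the map-reduce flow concrete, here is a simplified, self-contained sketch of the shape of the computation. It is not GraphRAG's implementation: the LLM call of the map step is stubbed out, and the token accounting is a stand-in.

```python
# Simplified illustration of the map-reduce flow described above (not GraphRAG's code).
from dataclasses import dataclass


@dataclass
class RatedPoint:
    text: str
    score: int  # importance rating assigned at the map step


def map_step(report_chunk: str) -> list[RatedPoint]:
    # In GraphRAG this is an LLM call that reads a batch of community report text
    # and returns a list of points, each with a numerical importance rating.
    return [RatedPoint(text=f"key point drawn from: {report_chunk[:40]}", score=len(report_chunk) % 100)]


def reduce_step(points: list[RatedPoint], token_budget: int = 8_000) -> str:
    # Rank points by rating and keep the most important ones that fit the budget;
    # GraphRAG then hands this filtered context to the LLM to write the final answer.
    kept, used = [], 0
    for point in sorted(points, key=lambda p: p.score, reverse=True):
        cost = len(point.text)  # stand-in for a real token count
        if used + cost > token_budget:
            break
        kept.append(point.text)
        used += cost
    return "\n".join(kept)


chunks = ["community report batch 1 ...", "community report batch 2 ..."]
intermediate = [p for chunk in chunks for p in map_step(chunk)]
final_context = reduce_step(intermediate)
```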
"}, {"location": "query/global_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the GlobalSearch class:
"}, {"location": "query/global_search/#how-to-use", "title": "How to Use", "text": "
llm
: OpenAI model object to be used for response generationcontext_builder
: context builder object to be used for preparing context data from community reportsmap_system_prompt
: prompt template used in themap
stage. Default template can be found at map_system_promptreduce_system_prompt
: prompt template used in thereduce
stage, default template can be found at reduce_system_promptresponse_type
: free-form text describing the desired response type and format (e.g.,Multiple Paragraphs
,Multi-Page Report
)allow_general_knowledge
: setting this to True will include additional instructions to thereduce_system_prompt
to prompt the LLM to incorporate relevant real-world knowledge outside of the dataset. Note that this may increase hallucinations, but can be useful for certain scenarios. Default is False.general_knowledge_inclusion_prompt
: instruction to add to thereduce_system_prompt
ifallow_general_knowledge
is enabled. Default instruction can be found at general_knowledge_instructionmax_data_tokens
: token budget for the context datamap_llm_params
: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call at themap
stagereduce_llm_params
: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call at thereduce
stagecontext_builder_params
: a dictionary of additional parameters to be passed to thecontext_builder
object when building the context window for themap
stage.concurrent_coroutines
: controls the degree of parallelism in themap
stage.callbacks
: optional callback functions, which can be used to provide custom event handlers for the LLM's completion streaming events.An example of a global search scenario can be found in the following notebook.
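A hypothetical construction sketch using the parameters above follows. The import path and the chat_llm / global_context_builder objects are assumptions; the linked notebook demonstrates how to build them from the index output tables.

```python
# Hypothetical sketch only -- import path and supporting objects are assumed.
import asyncio

from graphrag.query.structured_search.global_search.search import GlobalSearch  # assumed path

search_engine = GlobalSearch(
    llm=chat_llm,                            # OpenAI model object, built elsewhere (assumed)
    context_builder=global_context_builder,  # prepares context from community reports (assumed)
    max_data_tokens=12_000,                  # token budget for the context data
    map_llm_params={"max_tokens": 1000, "temperature": 0.0},
    reduce_llm_params={"max_tokens": 2000, "temperature": 0.0},
    allow_general_knowledge=False,           # keep answers grounded in the dataset
    concurrent_coroutines=32,                # parallelism of the map stage
    response_type="Multiple Paragraphs",
)

result = asyncio.run(search_engine.asearch("What are the top themes in the data?"))
print(result.response)
```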
"}, {"location": "query/local_search/", "title": "Local Search \ud83d\udd0e", "text": ""}, {"location": "query/local_search/#entity-based-reasoning", "title": "Entity-based Reasoning", "text": "The local search method combines structured data from the knowledge graph with unstructured data from the input documents to augment the LLM context with relevant entity information at query time. It is well-suited for answering questions that require an understanding of specific entities mentioned in the input documents (e.g., \u201cWhat are the healing properties of chamomile?\u201d).
"}, {"location": "query/local_search/#methodology", "title": "Methodology", "text": "---\ntitle: Local Search Dataflow\n---\n%%{ init: { 'flowchart': { 'curve': 'step' } } }%%\nflowchart LR\n\n uq[User Query] ---.1\n ch1[Conversation<br/>History]---.1\n\n .1--Entity<br/>Description<br/>Embedding--> ee[Extracted Entities]\n\n ee[Extracted Entities] ---.2--Entity-Text<br/>Unit Mapping--> ctu[Candidate<br/>Text Units]--Ranking + <br/>Filtering -->ptu[Prioritized<br/>Text Units]---.3\n .2--Entity-Report<br/>Mapping--> ccr[Candidate<br/>Community Reports]--Ranking + <br/>Filtering -->pcr[Prioritized<br/>Community Reports]---.3\n .2--Entity-Entity<br/>Relationships--> ce[Candidate<br/>Entities]--Ranking + <br/>Filtering -->pe[Prioritized<br/>Entities]---.3\n .2--Entity-Entity<br/>Relationships--> cr[Candidate<br/>Relationships]--Ranking + <br/>Filtering -->pr[Prioritized<br/>Relationships]---.3\n .2--Entity-Covariate<br/>Mappings--> cc[Candidate<br/>Covariates]--Ranking + <br/>Filtering -->pc[Prioritized<br/>Covariates]---.3\n ch1 -->ch2[Conversation History]---.3\n .3-->res[Response]\n\n classDef green fill:#26B653,stroke:#333,stroke-width:2px,color:#fff;\n classDef turquoise fill:#19CCD3,stroke:#333,stroke-width:2px,color:#fff;\n classDef rose fill:#DD8694,stroke:#333,stroke-width:2px,color:#fff;\n classDef orange fill:#F19914,stroke:#333,stroke-width:2px,color:#fff;\n classDef purple fill:#B356CD,stroke:#333,stroke-width:2px,color:#fff;\n classDef invisible fill:#fff,stroke:#fff,stroke-width:0px,color:#fff, width:0px;\n class uq,ch1 turquoise\n class ee green\n class ctu,ccr,ce,cr,cc rose\n class ptu,pcr,pe,pr,pc,ch2 orange\n class res purple\n class .1,.2,.3 invisible\n\n
Given a user query and, optionally, the conversation history, the local search method identifies a set of entities from the knowledge graph that are semantically related to the user input. These entities serve as access points into the knowledge graph, enabling the extraction of further relevant details such as connected entities, relationships, entity covariates, and community reports. It also extracts relevant text chunks from the raw input documents that are associated with the identified entities. These candidate data sources are then prioritized and filtered to fit within a single context window of pre-defined size, which is used to generate a response to the user query.
"}, {"location": "query/local_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the LocalSearch class:
"}, {"location": "query/local_search/#how-to-use", "title": "How to Use", "text": "
llm
: OpenAI model object to be used for response generationcontext_builder
: context builder object to be used for preparing context data from collections of knowledge model objectssystem_prompt
: prompt template used to generate the search response. Default template can be found at system_promptresponse_type
: free-form text describing the desired response type and format (e.g.,Multiple Paragraphs
,Multi-Page Report
)llm_params
: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM callcontext_builder_params
: a dictionary of additional parameters to be passed to thecontext_builder
object when building context for the search promptcallbacks
: optional callback functions, which can be used to provide custom event handlers for the LLM's completion streaming events.An example of a local search scenario can be found in the following notebook.
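As with global search, the parameters above can be wired together roughly as in the sketch below. The import path, supporting objects, and the context_builder_params keys are assumptions for illustration; see the linked notebook for the exact API.

```python
# Hypothetical sketch only -- import path, supporting objects, and parameter keys are assumed.
import asyncio

from graphrag.query.structured_search.local_search.search import LocalSearch  # assumed path

local_search = LocalSearch(
    llm=chat_llm,                           # OpenAI model object, built elsewhere (assumed)
    context_builder=local_context_builder,  # mixes entities, relationships, reports, and text units (assumed)
    llm_params={"max_tokens": 2000, "temperature": 0.0},
    context_builder_params={
        "max_tokens": 12_000,               # overall context window budget (assumed key)
        "text_unit_prop": 0.5,              # share of the window spent on raw text chunks (assumed key)
        "community_prop": 0.1,              # share spent on community reports (assumed key)
    },
    response_type="Multiple Paragraphs",
)

result = asyncio.run(local_search.asearch("What are the healing properties of chamomile?"))
print(result.response)
```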
"}, {"location": "query/overview/", "title": "Query Engine \ud83d\udd0e", "text": "The Query Engine is the retrieval module of the Graph RAG Library. It is one of the two main components of the Graph RAG library, the other being the Indexing Pipeline (see Indexing Pipeline). It is responsible for the following tasks:
"}, {"location": "query/overview/#local-search", "title": "Local Search", "text": "
- Local Search
- Global Search
- DRIFT Search
- Question Generation
The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks from the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).
For more details about how Local Search works please refer to the Local Search documentation.
"}, {"location": "query/overview/#global-search", "title": "Global Search", "text": "Global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but often gives good responses for questions that require an understanding of the dataset as a whole (e.g. What are the most significant values of the herbs mentioned in this notebook?).
For more details about how Global Search works, please refer to the Global Search documentation.
"}, {"location": "query/overview/#drift-search", "title": "DRIFT Search", "text": "DRIFT Search introduces a new approach to local search queries by including community information in the search process. This greatly expands the breadth of the query\u2019s starting point and leads to retrieval and usage of a far higher variety of facts in the final answer. This addition expands the GraphRAG query engine by providing a more comprehensive option for local search, which uses community insights to refine a query into detailed follow-up questions.
To learn more about DRIFT Search, please refer to the DRIFT Search documentation.
"}, {"location": "query/overview/#question-generation", "title": "Question Generation", "text": "This functionality takes a list of user queries and generates the next candidate questions. This is useful for generating follow-up questions in a conversation or for generating a list of questions for the investigator to dive deeper into the dataset.
Information about how question generation works can be found at the Question Generation documentation page.
"}, {"location": "query/question_generation/", "title": "Question Generation \u2754", "text": ""}, {"location": "query/question_generation/#entity-based-question-generation", "title": "Entity-based Question Generation", "text": "The question generation method combines structured data from the knowledge graph with unstructured data from the input documents to generate candidate questions related to specific entities.
"}, {"location": "query/question_generation/#methodology", "title": "Methodology", "text": "Given a list of prior user questions, the question generation method uses the same context-building approach employed in local search to extract and prioritize relevant structured and unstructured data, including entities, relationships, covariates, community reports and raw text chunks. These data records are then fitted into a single LLM prompt to generate candidate follow-up questions that represent the most important or urgent information content or themes in the data.
"}, {"location": "query/question_generation/#configuration", "title": "Configuration", "text": "Below are the key parameters of the Question Generation class:
"}, {"location": "query/question_generation/#how-to-use", "title": "How to Use", "text": "
llm
: OpenAI model object to be used for response generationcontext_builder
: context builder object to be used for preparing context data from collections of knowledge model objects, using the same context builder class as in local searchsystem_prompt
: prompt template used to generate candidate questions. Default template can be found at system_promptllm_params
: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM callcontext_builder_params
: a dictionary of additional parameters to be passed to thecontext_builder
object when building context for the question generation promptcallbacks
: optional callback functions, which can be used to provide custom event handlers for the LLM's completion streaming events.An example of the question generation function can be found in the following notebook.
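The sketch below shows one plausible way to assemble these parameters. The class name LocalQuestionGen, its import path, and the agenerate signature are assumptions; the linked notebook shows the exact usage.

```python
# Hypothetical sketch only -- class name, import path, and method signature are assumed.
import asyncio

from graphrag.query.question_gen.local_gen import LocalQuestionGen  # assumed path

question_generator = LocalQuestionGen(
    llm=chat_llm,                           # OpenAI model object, built elsewhere (assumed)
    context_builder=local_context_builder,  # same context builder class as local search (assumed)
    llm_params={"max_tokens": 2000, "temperature": 0.0},
)

candidates = asyncio.run(
    question_generator.agenerate(
        question_history=[
            "What are the healing properties of chamomile?",
            "Which other herbs are commonly mentioned alongside chamomile?",
        ],
        context_data=None,    # let the context builder assemble context from the index
        question_count=5,
    )
)
print(candidates.response)
```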
"}, {"location": "query/notebooks/overview/", "title": "API Notebooks", "text": ""}, {"location": "query/notebooks/overview/#query-engine-notebooks", "title": "Query Engine Notebooks", "text": "
- API Overview Notebook
For examples about running Query please refer to the following notebooks:
- Global Search Notebook
- Local Search Notebook
- DRIFT Search Notebook
The test dataset for these notebooks can be found in dataset.zip.
"}]} \ No newline at end of file +{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "", "title": "Welcome to GraphRAG", "text": "\ud83d\udc49 Microsoft Research Blog Post \ud83d\udc49 GraphRAG Accelerator \ud83d\udc49 GraphRAG Arxiv
Figure 1: An LLM-generated knowledge graph built using GPT-4 Turbo.
GraphRAG is a structured, hierarchical approach to Retrieval Augmented Generation (RAG), as opposed to naive semantic-search approaches using plain text snippets. The GraphRAG process involves extracting a knowledge graph out of raw text, building a community hierarchy, generating summaries for these communities, and then leveraging these structures when performing RAG-based tasks.
To learn more about GraphRAG and how it can be used to enhance your LLM's ability to reason about your private data, please visit the Microsoft Research Blog Post.
"}, {"location": "#solution-accelerator", "title": "Solution Accelerator \ud83d\ude80", "text": "To quickstart the GraphRAG system we recommend trying the Solution Accelerator package. This provides a user-friendly end-to-end experience with Azure resources.
"}, {"location": "#get-started-with-graphrag", "title": "Get Started with GraphRAG \ud83d\ude80", "text": "To start using GraphRAG, check out the Get Started guide. For a deeper dive into the main sub-systems, please visit the docpages for the Indexer and Query packages.
"}, {"location": "#graphrag-vs-baseline-rag", "title": "GraphRAG vs Baseline RAG \ud83d\udd0d", "text": "Retrieval-Augmented Generation (RAG) is a technique to improve LLM outputs using real-world information. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique, which we call Baseline RAG. GraphRAG uses knowledge graphs to provide substantial improvements in question-and-answer performance when reasoning about complex information. RAG techniques have shown promise in helping LLMs to reason about private datasets - data that the LLM is not trained on and has never seen before, such as an enterprise\u2019s proprietary research, business documents, or communications. Baseline RAG was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:
- Baseline RAG struggles to connect the dots. This happens when answering a question requires traversing disparate pieces of information through their shared attributes in order to provide new synthesized insights.
- Baseline RAG performs poorly when being asked to holistically understand summarized semantic concepts over large data collections or even singular large documents.
To address this, the tech community is working to develop methods that extend and enhance RAG. Microsoft Research\u2019s new approach, GraphRAG, uses LLMs to create a knowledge graph based on an input corpus. This graph, along with community summaries and graph machine learning outputs, is used to augment prompts at query time. GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.
"}, {"location": "#the-graphrag-process", "title": "The GraphRAG Process \ud83e\udd16", "text": "GraphRAG builds upon our prior research and tooling using graph machine learning. The basic steps of the GraphRAG process are as follows:
"}, {"location": "#index", "title": "Index", "text": ""}, {"location": "#query", "title": "Query", "text": "
- Slice up an input corpus into a series of TextUnits, which act as analyzable units for the rest of the process, and provide fine-grained references in our outputs.
- Extract all entities, relationships, and key claims from the TextUnits using an LLM.
- Perform a hierarchical clustering of the graph using the Leiden technique. To see this visually, check out Figure 1 above. Each circle is an entity (e.g., a person, place, or organization), with the size representing the degree of the entity, and the color representing its community.
- Generate summaries of each community and its constituents from the bottom-up. This aids in holistic understanding of the dataset.
At query time, these structures are used to provide materials for the LLM context window when answering a question. The primary query modes are:
"}, {"location": "#prompt-tuning", "title": "Prompt Tuning", "text": "
- Global Search for reasoning about holistic questions about the corpus by leveraging the community summaries.
- Local Search for reasoning about specific entities by fanning-out to their neighbors and associated concepts.
- DRIFT Search for reasoning about specific entities by fanning-out to their neighbors and associated concepts, but with the added context of community information.
Using GraphRAG with your data out of the box may not yield the best possible results. We strongly recommend fine-tuning your prompts by following the Prompt Tuning Guide in our documentation.
"}, {"location": "blog_posts/", "title": "Microsoft Research Blog", "text": "
GraphRAG: Unlocking LLM discovery on narrative private data
Published February 13, 2024
By Jonathan Larson, Senior Principal Data Architect; Steven Truitt, Principal Program Manager
GraphRAG: New tool for complex data discovery now on GitHub
Published July 2, 2024
By Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Steven Truitt, Principal Program Manager; Jonathan Larson, Senior Principal Data Architect
GraphRAG auto-tuning provides rapid adaptation to new domains
Published September 9, 2024
By Alonso Guevara Fern\u00e1ndez, Sr. Software Engineer; Katy Smith, Data Scientist II; Joshua Bradley, Senior Data Scientist; Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Sarah Smith, Senior Program Manager; Ben Cutler, Senior Director; Steven Truitt, Principal Program Manager; Jonathan Larson, Senior Principal Data Architect
Introducing DRIFT Search: Combining global and local search methods to improve quality and efficiency
Published October 31, 2024
By Julian Whiting, Senior Machine Learning Engineer; Zachary Hills, Senior Software Engineer; Alonso Guevara Fern\u00e1ndez, Sr. Software Engineer; Ha Trinh, Senior Data Scientist; Adam Bradley, Managing Partner, Strategic Research; Jonathan Larson, Senior Principal Data Architect
GraphRAG: Improving global search via dynamic community selection
Published November 15, 2024
By Bryan Li, Research Intern; Ha Trinh, Senior Data Scientist; Darren Edge, Senior Director; Jonathan Larson, Senior Principal Data Architect
LazyGraphRAG: Setting a new standard for quality and cost
Published November 25, 2024
By Darren Edge, Senior Director; Ha Trinh, Senior Data Scientist; Jonathan Larson, Senior Principal Data Architect
Moving to GraphRAG 1.0 \u2013 Streamlining ergonomics for developers and users
Published December 16, 2024
By Nathan Evans, Principal Software Architect; Alonso Guevara Fern\u00e1ndez, Senior Software Engineer; Joshua Bradley, Senior Data Scientist
"}, {"location": "cli/", "title": "CLI Reference", "text": "This page documents the command-line interface of the graphrag library.
"}, {"location": "cli/#graphrag", "title": "graphrag", "text": "GraphRAG: A graph-based retrieval-augmented generation (RAG) system.
Usage:
[OPTIONS] COMMAND [ARGS]...\n
Options:
"}, {"location": "cli/#index", "title": "index", "text": "--install-completion Install completion for the current shell.\n --show-completion Show completion for the current shell, to copy it or\n customize the installation.\n
Build a knowledge graph index.
Usage:
index [OPTIONS]\n
Options:
"}, {"location": "cli/#init", "title": "init", "text": "--config PATH The configuration to use.\n --root PATH The project root directory. \\[default: .]\n --verbose / --no-verbose Run the indexing pipeline with verbose\n logging \\[default: no-verbose]\n --memprofile / --no-memprofile Run the indexing pipeline with memory\n profiling \\[default: no-memprofile]\n --resume TEXT Resume a given indexing run\n --logger [rich|print|none] The progress logger to use. \\[default:\n rich]\n --dry-run / --no-dry-run Run the indexing pipeline without executing\n any steps to inspect and validate the\n configuration. \\[default: no-dry-run]\n --cache / --no-cache Use LLM cache. \\[default: cache]\n --skip-validation / --no-skip-validation\n Skip any preflight validation. Useful when\n running no LLM steps. \\[default: no-skip-\n validation]\n --output PATH Indexing pipeline output directory.\n Overrides storage.base_dir in the\n configuration file.\n
Generate a default configuration file.
Usage:
init [OPTIONS]\n
Options:
"}, {"location": "cli/#prompt-tune", "title": "prompt-tune", "text": "--root PATH The project root directory. \\[required]\n
Generate custom graphrag prompts with your own data (i.e. auto templating).
Usage:
prompt-tune [OPTIONS]\n
Options:
"}, {"location": "cli/#query", "title": "query", "text": "--root PATH The project root directory. \\[default: .]\n --config PATH The configuration to use.\n --domain TEXT The domain your input data is related to.\n For example 'space science', 'microbiology',\n 'environmental news'. If not defined, a\n domain will be inferred from the input data.\n --selection-method [all|random|top|auto]\n The text chunk selection method. \\[default:\n random]\n --n-subset-max INTEGER The number of text chunks to embed when\n --selection-method=auto. \\[default: 300]\n --k INTEGER The maximum number of documents to select\n from each centroid when --selection-\n method=auto. \\[default: 15]\n --limit INTEGER The number of documents to load when\n --selection-method={random,top}. \\[default:\n 15]\n --max-tokens INTEGER The max token count for prompt generation.\n \\[default: 2000]\n --min-examples-required INTEGER\n The minimum number of examples to\n generate/include in the entity extraction\n prompt. \\[default: 2]\n --chunk-size INTEGER The max token count for prompt generation.\n \\[default: 200]\n --language TEXT The primary language used for inputs and\n outputs in graphrag prompts.\n --discover-entity-types / --no-discover-entity-types\n Discover and extract unspecified entity\n types. \\[default: discover-entity-types]\n --output PATH The directory to save prompts to, relative\n to the project root directory. \\[default:\n prompts]\n
Query a knowledge graph index.
Usage:
query [OPTIONS]\n
Options:
"}, {"location": "cli/#update", "title": "update", "text": "--method [local|global|drift|basic]\n The query algorithm to use. \\[required]\n --query TEXT The query to execute. \\[required]\n --config PATH The configuration to use.\n --data PATH Indexing pipeline output directory (i.e.\n contains the parquet files).\n --root PATH The project root directory. \\[default: .]\n --community-level INTEGER The community level in the Leiden community\n hierarchy from which to load community\n reports. Higher values represent reports\n from smaller communities. \\[default: 2]\n --dynamic-community-selection / --no-dynamic-community-selection\n Use global search with dynamic community\n selection. \\[default: no-dynamic-community-\n selection]\n --response-type TEXT Free form text describing the response type\n and format, can be anything, e.g. Multiple\n Paragraphs, Single Paragraph, Single\n Sentence, List of 3-7 Points, Single Page,\n Multi-Page Report. Default: Multiple\n Paragraphs \\[default: Multiple Paragraphs]\n --streaming / --no-streaming Print response in a streaming manner.\n \\[default: no-streaming]\n
Update an existing knowledge graph index.
Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the
update_output
folder.Usage:
update [OPTIONS]\n
Options:
"}, {"location": "developing/", "title": "Development Guide", "text": ""}, {"location": "developing/#requirements", "title": "Requirements", "text": "Name Installation Purpose Python 3.10-3.12 Download The library is Python-based. Poetry Instructions Poetry is used for package management and virtualenv management in Python codebases"}, {"location": "developing/#getting-started", "title": "Getting Started", "text": ""}, {"location": "developing/#install-dependencies", "title": "Install Dependencies", "text": "--config PATH The configuration to use.\n --root PATH The project root directory. \\[default: .]\n --verbose / --no-verbose Run the indexing pipeline with verbose\n logging \\[default: no-verbose]\n --memprofile / --no-memprofile Run the indexing pipeline with memory\n profiling \\[default: no-memprofile]\n --logger [rich|print|none] The progress logger to use. \\[default:\n rich]\n --cache / --no-cache Use LLM cache. \\[default: cache]\n --skip-validation / --no-skip-validation\n Skip any preflight validation. Useful when\n running no LLM steps. \\[default: no-skip-\n validation]\n --output PATH Indexing pipeline output directory.\n Overrides storage.base_dir in the\n configuration file.\n
"}, {"location": "developing/#execute-the-indexing-engine", "title": "Execute the Indexing Engine", "text": "# Install Python dependencies.\npoetry install\n
"}, {"location": "developing/#executing-queries", "title": "Executing Queries", "text": "poetry run poe index <...args>\n
"}, {"location": "developing/#azurite", "title": "Azurite", "text": "poetry run poe query <...args>\n
Some unit and smoke tests use Azurite to emulate Azure resources. This can be started by running:
./scripts/start-azurite.sh\n
or by simply running
"}, {"location": "developing/#lifecycle-scripts", "title": "Lifecycle Scripts", "text": "azurite
in the terminal if already installed globally. See the Azurite documentation for more information about how to install and use Azurite.Our Python package utilizes Poetry to manage dependencies and poethepoet to manage build scripts.
Available scripts are:
"}, {"location": "developing/#troubleshooting", "title": "Troubleshooting", "text": ""}, {"location": "developing/#runtimeerror-llvm-config-failed-executing-please-point-llvm_config-to-the-path-for-llvm-config-when-running-poetry-install", "title": "\"RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config\" when running poetry install", "text": "
poetry run poe index
- Run the Indexing CLIpoetry run poe query
- Run the Query CLIpoetry build
- This invokespoetry build
, which will build a wheel file and other distributable artifacts.poetry run poe test
- This will execute all tests.poetry run poe test_unit
- This will execute unit tests.poetry run poe test_integration
- This will execute integration tests.poetry run poe test_smoke
- This will execute smoke tests.poetry run poe check
- This will perform a suite of static checks across the package, including:- formatting
- documentation formatting
- linting
- security patterns
- type-checking
poetry run poe fix
- This will apply any available auto-fixes to the package. Usually this is just formatting fixes.poetry run poe fix_unsafe
- This will apply any available auto-fixes to the package, including those that may be unsafe.poetry run poe format
- Explicitly run the formatter across the package.Make sure llvm-9 and llvm-9-dev are installed:
sudo apt-get install llvm-9 llvm-9-dev
and then in your bashrc, add
"}, {"location": "developing/#numba_pymoduleh610-fatal-error-pythonh-no-such-file-or-directory-when-running-poetry-install", "title": "\"numba/_pymodule.h:6:10: fatal error: Python.h: No such file or directory\" when running poetry install", "text": "
export LLVM_CONFIG=/usr/bin/llvm-config-9
Make sure you have python3.10-dev installed or, more generally,
python<version>-dev
"}, {"location": "developing/#llm-call-constantly-exceeds-tpm-rpm-or-time-limits", "title": "LLM call constantly exceeds TPM, RPM or time limits", "text": "
sudo apt-get install python3.10-dev
"}, {"location": "get_started/", "title": "Getting Started", "text": ""}, {"location": "get_started/#requirements", "title": "Requirements", "text": "
GRAPHRAG_LLM_THREAD_COUNT
andGRAPHRAG_EMBEDDING_THREAD_COUNT
are both set to 50 by default. You can modify these values to reduce concurrency. Please refer to the Configuration DocumentsPython 3.10-3.12
To get started with the GraphRAG system, you have a few options:
\ud83d\udc49 Use the GraphRAG Accelerator solution \ud83d\udc49 Install from PyPI \ud83d\udc49 Use it from source
"}, {"location": "get_started/#quickstart", "title": "Quickstart", "text": "To get started with the GraphRAG system we recommend trying the Solution Accelerator package. This provides a user-friendly end-to-end experience with Azure resources.
"}, {"location": "get_started/#overview", "title": "Overview", "text": "The following is a simple end-to-end example for using the GraphRAG system. It shows how to use the system to index some text, and then use the indexed data to answer questions about the documents.
"}, {"location": "get_started/#install-graphrag", "title": "Install GraphRAG", "text": "pip install graphrag\n
The graphrag library includes a CLI for a no-code approach to getting started. Please review the full CLI documentation for further detail.
"}, {"location": "get_started/#running-the-indexer", "title": "Running the Indexer", "text": "We need to set up a data project and some initial configuration. First let's get a sample dataset ready:
mkdir -p ./ragtest/input\n
Get a copy of A Christmas Carol by Charles Dickens from a trusted source:
"}, {"location": "get_started/#set-up-your-workspace-variables", "title": "Set Up Your Workspace Variables", "text": "curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt\n
To initialize your workspace, first run the
graphrag init
command. Since we have already configured a directory named./ragtest
in the previous step, run the following command:graphrag init --root ./ragtest\n
This will create two files:
.env
andsettings.yaml
in the./ragtest
directory."}, {"location": "get_started/#openai-and-azure-openai", "title": "OpenAI and Azure OpenAI", "text": "
.env
contains the environment variables required to run the GraphRAG pipeline. If you inspect the file, you'll see a single environment variable defined,GRAPHRAG_API_KEY=<API_KEY>
. This is the API key for the OpenAI API or Azure OpenAI endpoint. You can replace this with your own API key. If you are using another form of authentication (i.e. managed identity), please delete this file.settings.yaml
contains the settings for the pipeline. You can modify this file to change the settings for the pipeline.If running in OpenAI mode, update the value of
"}, {"location": "get_started/#azure-openai", "title": "Azure OpenAI", "text": "GRAPHRAG_API_KEY
in the.env
file with your OpenAI API key.In addition, Azure OpenAI users should set the following variables in the settings.yaml file. To find the appropriate sections, just search for the
llm:
configuration; you should see two sections, one for the chat endpoint and one for the embeddings endpoint. Here is an example of how to configure the chat endpoint:type: azure_openai_chat # Or azure_openai_embedding for embeddings\napi_base: https://<instance>.openai.azure.com\napi_version: 2024-02-15-preview # You can customize this for other versions\ndeployment_name: <azure_model_deployment_name>\n
"}, {"location": "get_started/#running-the-indexing-pipeline", "title": "Running the Indexing pipeline", "text": "
- For more details about configuring GraphRAG, see the configuration documentation.
- To learn more about Initialization, refer to the Initialization documentation.
- For more details about using the CLI, refer to the CLI documentation.
Finally we'll run the pipeline!
graphrag index --root ./ragtest\n
This process will take some time to run. This depends on the size of your input data, what model you're using, and the text chunk size being used (these can be configured in your
"}, {"location": "get_started/#using-the-query-engine", "title": "Using the Query Engine", "text": ""}, {"location": "get_started/#running-the-query-engine", "title": "Running the Query Engine", "text": "settings.yml
file). Once the pipeline is complete, you should see a new folder called./ragtest/output
with a series of parquet files.Now let's ask some questions using this dataset.
Here is an example using Global search to ask a high-level question:
graphrag query \\\n--root ./ragtest \\\n--method global \\\n--query \"What are the top themes in this story?\"\n
Here is an example using Local search to ask a more specific question about a particular character:
graphrag query \\\n--root ./ragtest \\\n--method local \\\n--query \"Who is Scrooge and what are his main relationships?\"\n
Please refer to Query Engine docs for detailed information about how to leverage our Local and Global search mechanisms for extracting meaningful insights from data after the Indexer has wrapped up execution.
"}, {"location": "get_started/#visualizing-the-graph", "title": "Visualizing the Graph", "text": "Check out our visualization guide for a more interactive experience in debugging and exploring the knowledge graph.
"}, {"location": "visualization_guide/", "title": "Visualizing and Debugging Your Knowledge Graph", "text": "The following step-by-step guide walks through the process to visualize a knowledge graph after it's been constructed by graphrag. Note that some of the settings recommended below are based on our own experience of what works well. Feel free to change and explore other settings for a better visualization experience!
"}, {"location": "visualization_guide/#1-run-the-pipeline", "title": "1. Run the Pipeline", "text": "Before building an index, please review your
settings.yaml
configuration file and ensure that graphml snapshots are enabled.(Optional) To support other visualization tools and exploration, additional parameters can be enabled that provide access to vector embeddings.snapshots:\n graphml: true\n
After running the indexing pipeline over your data, there will be an output folder (defined by theembed_graph:\n enabled: true # will generate node2vec embeddings for nodes\numap:\n enabled: true # will generate UMAP embeddings for nodes\n
storage.base_dir
setting)."}, {"location": "visualization_guide/#2-locate-the-knowledge-graph", "title": "2. Locate the Knowledge Graph", "text": "
- Output Folder: Contains artifacts from the LLM\u2019s indexing pass.
In the output folder, look for a file named
"}, {"location": "visualization_guide/#3-open-the-graph-in-gephi", "title": "3. Open the Graph in Gephi", "text": "merged_graph.graphml
. graphml is a standard file format supported by many visualization tools. We recommend trying Gephi."}, {"location": "visualization_guide/#4-install-the-leiden-algorithm-plugin", "title": "4. Install the Leiden Algorithm Plugin", "text": "
- Install and open Gephi
- Navigate to the
output
folder containing the various parquet files.- Import the
merged_graph.graphml
file into Gephi. This will result in a fairly plain view of the undirected graph nodes and edges."}, {"location": "visualization_guide/#5-run-statistics", "title": "5. Run Statistics", "text": "
- Go to
Tools
->Plugins
.- Search for \"Leiden Algorithm\".
- Click
Install
and restart Gephi.
- In the
Statistics
tab on the right, clickRun
forAverage Degree
andLeiden Algorithm
."}, {"location": "visualization_guide/#6-color-the-graph-by-clusters", "title": "6. Color the Graph by Clusters", "text": "
- For the Leiden Algorithm, adjust the settings:
- Quality function: Modularity
- Resolution: 1
- Go to the
Appearance
pane in the upper left side of Gephi."}, {"location": "visualization_guide/#7-resize-nodes-by-degree-centrality", "title": "7. Resize Nodes by Degree Centrality", "text": "
- Select
Nodes
, thenPartition
, and click the color palette icon in the upper right.- Choose
Cluster
from the dropdown.- Click the
Palette...
hyperlink, thenGenerate...
.- Uncheck
Limit number of colors
, clickGenerate
, and thenOk
.- Click
Apply
to color the graph. This will color the graph based on the partitions discovered by Leiden."}, {"location": "visualization_guide/#8-layout-the-graph", "title": "8. Layout the Graph", "text": "
- In the
Appearance
pane in the upper left, selectNodes
->Ranking
- Select the
Sizing
icon in the upper right.- Choose
Degree
and set:- Min: 10
- Max: 150
- Click
Apply
.
- In the
Layout
tab in the lower left, selectOpenORD
."}, {"location": "visualization_guide/#9-run-forceatlas2", "title": "9. Run ForceAtlas2", "text": "
- Set
Liquid
andExpansion
stages to 50, and everything else to 0.- Click
Run
and monitor the progress.
- Select
Force Atlas 2
in the layout options."}, {"location": "visualization_guide/#10-add-text-labels-optional", "title": "10. Add Text Labels (Optional)", "text": "
- Adjust the settings:
- Scaling: 15
- Dissuade Hubs: checked
- LinLog mode: uncheck
- Prevent Overlap: checked
- Click
Run
and wait.- Press
Stop
when it looks like the graph nodes have settled and no longer change position significantly.
- Turn on text labels in the appropriate section.
- Configure and resize them as needed.
Your final graph should now be visually organized and ready for analysis!
"}, {"location": "config/env_vars/", "title": "Default Configuration Mode (using Env Vars)", "text": ""}, {"location": "config/env_vars/#text-embeddings-customization", "title": "Text-Embeddings Customization", "text": "By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the
GRAPHRAG_EMBEDDING_TARGET
environment variable toall
.If the embedding target is
"}, {"location": "config/env_vars/#embedded-fields", "title": "Embedded Fields", "text": "all
, and you want to only embed a subset of these fields, you may specify which embeddings to skip using theGRAPHRAG_EMBEDDING_SKIP
argument described below."}, {"location": "config/env_vars/#input-data", "title": "Input Data", "text": "
text_unit.text
document.text
entity.title
entity.description
relationship.description
community.title
community.summary
community.full_content
Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with
"}, {"location": "config/env_vars/#base-llm-settings", "title": "Base LLM Settings", "text": "GRAPHRAG_INPUT_
below. In general, CSV-based data provides the most customizability. Each CSV should at least contain atext
field (which can be mapped with environment variables), but it's helpful if they also havetitle
,timestamp
, andsource
fields. Additional fields can be included as well, which will land as extra fields on theDocument
table.These are the primary settings for configuring LLM connectivity.
Parameter Required? Description Type Default ValueGRAPHRAG_API_KEY
Yes for OpenAI. Optional for AOAI The API key. (Note: OPENAI_API_KEY is also used as a fallback.) If not defined when using AOAI, managed identity will be used.
str
None
For AOAI The API Base URLstr
None
GRAPHRAG_API_VERSION
For AOAI The AOAI API version.str
None
GRAPHRAG_API_ORGANIZATION
The AOAI organization.str
None
GRAPHRAG_API_PROXY
The AOAI proxy.str
None
"}, {"location": "config/env_vars/#text-generation-settings", "title": "Text Generation Settings", "text": "These settings control the text generation model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
Parameter Required? Description Type Default ValueGRAPHRAG_LLM_TYPE
For AOAI The LLM operation type. Eitheropenai_chat
orazure_openai_chat
str
openai_chat
GRAPHRAG_LLM_DEPLOYMENT_NAME
For AOAI The AOAI model deployment name.str
None
GRAPHRAG_LLM_API_KEY
Yes (uses fallback) The API key. If not defined when using AOAI, managed identity will be used.str
None
GRAPHRAG_LLM_API_BASE
For AOAI (uses fallback) The API Base URLstr
None
GRAPHRAG_LLM_API_VERSION
For AOAI (uses fallback) The AOAI API version.str
None
GRAPHRAG_LLM_API_ORGANIZATION
For AOAI (uses fallback) The AOAI organization.str
None
GRAPHRAG_LLM_API_PROXY
The AOAI proxy.str
None
GRAPHRAG_LLM_MODEL
The LLM model.str
gpt-4-turbo-preview
GRAPHRAG_LLM_MAX_TOKENS
The maximum number of tokens.int
4000
GRAPHRAG_LLM_REQUEST_TIMEOUT
The maximum number of seconds to wait for a response from the chat client.int
180
GRAPHRAG_LLM_MODEL_SUPPORTS_JSON
Indicates whether the given model supports JSON output mode.True
to enable.str
None
GRAPHRAG_LLM_THREAD_COUNT
The number of threads to use for LLM parallelization.int
50GRAPHRAG_LLM_THREAD_STAGGER
The time to wait (in seconds) between starting each thread.float
0.3GRAPHRAG_LLM_CONCURRENT_REQUESTS
The number of concurrent requests to allow for the embedding client.int
25GRAPHRAG_LLM_TOKENS_PER_MINUTE
The number of tokens per minute to allow for the LLM client. 0 = Bypassint
0GRAPHRAG_LLM_REQUESTS_PER_MINUTE
The number of requests per minute to allow for the LLM client. 0 = Bypassint
0GRAPHRAG_LLM_MAX_RETRIES
The maximum number of retries to attempt when a request fails.int
10GRAPHRAG_LLM_MAX_RETRY_WAIT
The maximum number of seconds to wait between retries.int
10GRAPHRAG_LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION
Whether to sleep on rate limit recommendation. (Azure Only)bool
True
GRAPHRAG_LLM_TEMPERATURE
The temperature to use for generation.float
0GRAPHRAG_LLM_TOP_P
The top_p to use for sampling.float
1GRAPHRAG_LLM_N
The number of responses to generate.int
1"}, {"location": "config/env_vars/#text-embedding-settings", "title": "Text Embedding Settings", "text": "These settings control the text embedding model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
Parameter Required ? Description Type DefaultGRAPHRAG_EMBEDDING_TYPE
For AOAI The embedding client to use. Eitheropenai_embedding
orazure_openai_embedding
str
openai_embedding
GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
For AOAI The AOAI deployment name.str
None
GRAPHRAG_EMBEDDING_API_KEY
Yes (uses fallback) The API key to use for the embedding client. If not defined when using AOAI, managed identity will be used.str
None
GRAPHRAG_EMBEDDING_API_BASE
For AOAI (uses fallback) The API base URL.str
None
GRAPHRAG_EMBEDDING_API_VERSION
For AOAI (uses fallback) The AOAI API version to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_API_ORGANIZATION
For AOAI (uses fallback) The AOAI organization to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_API_PROXY
The AOAI proxy to use for the embedding client.str
None
GRAPHRAG_EMBEDDING_MODEL
The model to use for the embedding client.str
text-embedding-3-small
GRAPHRAG_EMBEDDING_BATCH_SIZE
The number of texts to embed at once. (Azure limit is 16)int
16GRAPHRAG_EMBEDDING_BATCH_MAX_TOKENS
The maximum tokens per batch (Azure limit is 8191)int
8191GRAPHRAG_EMBEDDING_TARGET
The target fields to embed. Eitherrequired
orall
.str
required
GRAPHRAG_EMBEDDING_SKIP
A comma-separated list of fields to skip embeddings for . (e.g. 'relationship.description')str
None
GRAPHRAG_EMBEDDING_THREAD_COUNT
The number of threads to use for parallelization for embeddings.int
50GRAPHRAG_EMBEDDING_THREAD_STAGGER
The time to wait (in seconds) between starting each thread for embeddings.float
0.3GRAPHRAG_EMBEDDING_CONCURRENT_REQUESTS
The number of concurrent requests to allow for the embedding client.int
25GRAPHRAG_EMBEDDING_TOKENS_PER_MINUTE
The number of tokens per minute to allow for the embedding client. 0 = Bypassint
0GRAPHRAG_EMBEDDING_REQUESTS_PER_MINUTE
The number of requests per minute to allow for the embedding client. 0 = Bypassint
0GRAPHRAG_EMBEDDING_MAX_RETRIES
The maximum number of retries to attempt when a request fails.int
10GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT
The maximum number of seconds to wait between retries.int
10GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION
Whether to sleep on rate limit recommendation. (Azure Only)bool
True
"}, {"location": "config/env_vars/#input-settings", "title": "Input Settings", "text": "These settings control the data input used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
"}, {"location": "config/env_vars/#plaintext-input-data-graphrag_input_file_typetext", "title": "Plaintext Input Data (GRAPHRAG_INPUT_FILE_TYPE
=text)", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_FILE_PATTERN
The file pattern regexp to use when reading input files from the input directory.str
optional.*\\.txt$
"}, {"location": "config/env_vars/#csv-input-data-graphrag_input_file_typecsv", "title": "CSV Input Data (GRAPHRAG_INPUT_FILE_TYPE
=csv)", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_TYPE
The input storage type to use when reading files. (file
orblob
)str
optionalfile
GRAPHRAG_INPUT_FILE_PATTERN
The file pattern regexp to use when reading input files from the input directory.str
optional.*\\.txt$
GRAPHRAG_INPUT_SOURCE_COLUMN
The 'source' column to use when reading CSV input files.str
optionalsource
GRAPHRAG_INPUT_TIMESTAMP_COLUMN
The 'timestamp' column to use when reading CSV input files.str
optionalNone
GRAPHRAG_INPUT_TIMESTAMP_FORMAT
The timestamp format to use when parsing timestamps in the timestamp column.str
optionalNone
GRAPHRAG_INPUT_TEXT_COLUMN
The 'text' column to use when reading CSV input files.str
optionaltext
GRAPHRAG_INPUT_DOCUMENT_ATTRIBUTE_COLUMNS
A list of CSV columns, comma-separated, to incorporate as document fields.str
optionalid
GRAPHRAG_INPUT_TITLE_COLUMN
The 'title' column to use when reading CSV input files.str
optionaltitle
GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optionalNone
GRAPHRAG_INPUT_CONNECTION_STRING
The connection string to use when reading CSV input files from Azure Blob Storage.str
optionalNone
GRAPHRAG_INPUT_CONTAINER_NAME
The container name to use when reading CSV input files from Azure Blob Storage.str
optionalNone
GRAPHRAG_INPUT_BASE_DIR
The base directory to read input files from.str
optionalNone
"}, {"location": "config/env_vars/#data-mapping-settings", "title": "Data Mapping Settings", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_INPUT_FILE_TYPE
The type of input data,csv
ortext
str
optionaltext
GRAPHRAG_INPUT_ENCODING
The encoding to apply when reading CSV/text input files.str
optionalutf-8
"}, {"location": "config/env_vars/#data-chunking", "title": "Data Chunking", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_CHUNK_SIZE
The chunk size in tokens for text-chunk analysis windows.str
optional 1200GRAPHRAG_CHUNK_OVERLAP
The chunk overlap in tokens for text-chunk analysis windows.str
optional 100GRAPHRAG_CHUNK_BY_COLUMNS
A comma-separated list of document attributes to groupby when performing TextUnit chunking.str
optionalid
GRAPHRAG_CHUNK_ENCODING_MODEL
The encoding model to use for chunking.str
optional The top-level encoding model."}, {"location": "config/env_vars/#prompting-overrides", "title": "Prompting Overrides", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE
The path (relative to the root) of an entity extraction prompt template text file.str
optionalNone
GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS
The maximum number of redrives (gleanings) to invoke when extracting entities in a loop.int
optional 1GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES
A comma-separated list of entity types to extract.str
optionalorganization,person,event,geo
GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL
The encoding model to use for entity extraction.str
optional The top-level encoding model.GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE
The path (relative to the root) of a description summarization prompt template text file.str
optionalNone
GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH
The maximum number of tokens to generate per description summarization.int
optional 500GRAPHRAG_CLAIM_EXTRACTION_ENABLED
Whether claim extraction is enabled for this pipeline.bool
optionalFalse
GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION
The claim_description prompting argument to utilize.string
optional \"Any claims or facts that could be relevant to threat analysis.\"GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE
The claim extraction prompt to utilize.string
optionalNone
GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS
The maximum number of redrives (gleanings) to invoke when extracting claims in a loop.int
optional 1GRAPHRAG_CLAIM_EXTRACTION_ENCODING_MODEL
The encoding model to use for claim extraction.str
optional The top-level encoding modelGRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE
The community reports extraction prompt to utilize.string
optionalNone
GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH
The maximum number of tokens to generate per community report.int
optional 1500"}, {"location": "config/env_vars/#storage", "title": "Storage", "text": "This section controls the storage mechanism used by the pipeline used for exporting output tables.
Parameter Description Type Required or Optional DefaultGRAPHRAG_STORAGE_TYPE
The type of storage to use. Options arefile
,memory
, orblob
str
optionalfile
GRAPHRAG_STORAGE_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_STORAGE_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_STORAGE_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_STORAGE_BASE_DIR
The base path to data outputs.str
optional None"}, {"location": "config/env_vars/#cache", "title": "Cache", "text": "This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results.
Parameter Description Type Required or Optional DefaultGRAPHRAG_CACHE_TYPE
The type of cache to use. Options arefile
,memory
,none
orblob
str
optionalfile
GRAPHRAG_CACHE_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_CACHE_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_CACHE_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_CACHE_BASE_DIR
The base path to the cache files.str
optional None"}, {"location": "config/env_vars/#reporting", "title": "Reporting", "text": "This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to the console or to an Azure Blob Storage container.
Parameter Description Type Required or Optional DefaultGRAPHRAG_REPORTING_TYPE
The type of reporter to use. Options arefile
,console
, orblob
str
optionalfile
GRAPHRAG_REPORTING_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when inblob
mode and using managed identity. Will have the formathttps://<storage_account_name>.blob.core.windows.net
str
optional NoneGRAPHRAG_REPORTING_CONNECTION_STRING
The Azure Storage connection string to use when inblob
mode.str
optional NoneGRAPHRAG_REPORTING_CONTAINER_NAME
The Azure Storage container name to use when inblob
mode.str
optional NoneGRAPHRAG_REPORTING_BASE_DIR
The base path to the reporting outputs.str
optional None"}, {"location": "config/env_vars/#node2vec-parameters", "title": "Node2Vec Parameters", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_NODE2VEC_ENABLED
Whether to enable Node2Vecbool
optional FalseGRAPHRAG_NODE2VEC_NUM_WALKS
The Node2Vec number of walks to performint
optional 10GRAPHRAG_NODE2VEC_WALK_LENGTH
The Node2Vec walk lengthint
optional 40GRAPHRAG_NODE2VEC_WINDOW_SIZE
The Node2Vec window sizeint
optional 2GRAPHRAG_NODE2VEC_ITERATIONS
The number of iterations to run node2vecint
optional 3GRAPHRAG_NODE2VEC_RANDOM_SEED
The random seed to use for node2vecint
optional 597832"}, {"location": "config/env_vars/#data-snapshotting", "title": "Data Snapshotting", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_SNAPSHOT_EMBEDDINGS
Whether to enable embeddings snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_GRAPHML
Whether to enable GraphML snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_RAW_ENTITIES
Whether to enable raw entity snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_TOP_LEVEL_NODES
Whether to enable top-level node snapshots.bool
optional FalseGRAPHRAG_SNAPSHOT_TRANSIENT
Whether to enable transient table snapshots.bool
optional False"}, {"location": "config/env_vars/#miscellaneous-settings", "title": "Miscellaneous Settings", "text": "Parameter Description Type Required or Optional DefaultGRAPHRAG_ASYNC_MODE
Which async mode to use. Eitherasyncio
orthreaded
.str
optionalasyncio
GRAPHRAG_ENCODING_MODEL
The text encoding model, used in tiktoken, to encode text.str
optionalcl100k_base
GRAPHRAG_MAX_CLUSTER_SIZE
The maximum number of entities to include in a single Leiden cluster.int
optional 10GRAPHRAG_SKIP_WORKFLOWS
A comma-separated list of workflow names to skip.str
optionalNone
GRAPHRAG_UMAP_ENABLED
Whether to enable UMAP layoutsbool
optional False"}, {"location": "config/init/", "title": "Configuring GraphRAG Indexing", "text": "To start using GraphRAG, you must generate a configuration file. The
"}, {"location": "config/init/#usage", "title": "Usage", "text": "init
command is the easiest way to get started. It will create the.env
andsettings.yaml
files in the specified directory with the necessary configuration settings. It will also output the default LLM prompts used by GraphRAG."}, {"location": "config/init/#options", "title": "Options", "text": "graphrag init [--root PATH]\n
"}, {"location": "config/init/#example", "title": "Example", "text": "
--root PATH
- The project root directory to initialize graphrag at. Default is the current directory."}, {"location": "config/init/#output", "title": "Output", "text": "graphrag init --root ./ragtest\n
The
init
command will create the following files in the specified directory:"}, {"location": "config/init/#next-steps", "title": "Next Steps", "text": "
settings.yaml
- The configuration settings file. This file contains the configuration settings for GraphRAG..env
- The environment variables file. These are referenced in thesettings.yaml
file.prompts/
- The LLM prompts folder. This contains the default prompts used by GraphRAG, you can modify them or run the Auto Prompt Tuning command to generate new prompts adapted to your data.After initializing your workspace, you can either run the Prompt Tuning command to adapt the prompts to your data or even start running the Indexing Pipeline to index your data. For more information on configuring GraphRAG, see the Configuration documentation.
"}, {"location": "config/overview/", "title": "Configuring GraphRAG Indexing", "text": "The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine.
"}, {"location": "config/overview/#default-configuration-mode", "title": "Default Configuration Mode", "text": "The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. The main ways to set up GraphRAG in Default Configuration mode are via:
"}, {"location": "config/yaml/", "title": "Default Configuration Mode (using YAML/JSON)", "text": "
- Init command (recommended)
- Using YAML for deeper control
- Purely using environment variables
The default configuration mode may be configured by using a
settings.yml
orsettings.json
file in the data project root. If a.env
file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using${ENV_VAR}
syntax. We initialize with YML by default ingraphrag init
but you may use the equivalent JSON form if preferred.Many of these config values have defaults. Rather than replicate them here, please refer to the constants in the code directly.
For example:
"}, {"location": "config/yaml/#config-sections", "title": "Config Sections", "text": ""}, {"location": "config/yaml/#indexing", "title": "Indexing", "text": ""}, {"location": "config/yaml/#llm", "title": "llm", "text": "# .env\nGRAPHRAG_API_KEY=some_api_key\n\n# settings.yml\nllm: \n api_key: ${GRAPHRAG_API_KEY}\n
This is the base LLM configuration section. Other steps may override this configuration with their own LLM configuration.
"}, {"location": "config/yaml/#fields", "title": "Fields", "text": ""}, {"location": "config/yaml/#parallelization", "title": "parallelization", "text": ""}, {"location": "config/yaml/#fields_1", "title": "Fields", "text": "
api_key
str - The OpenAI API key to use.type
openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding - The type of LLM to use.model
str - The model name.max_tokens
int - The maximum number of output tokens.request_timeout
float - The per-request timeout.api_base
str - The API base url to use.api_version
str - The API versionorganization
str - The client organization.proxy
str - The proxy URL to use.audience
str - (Azure OpenAI only) The URI of the target Azure resource/service for which a managed identity token is requested. Used ifapi_key
is not defined. Default=https://cognitiveservices.azure.com/.default
deployment_name
str - The deployment name to use (Azure).model_supports_json
bool - Whether the model supports JSON-mode output.tokens_per_minute
int - Set a leaky-bucket throttle on tokens-per-minute.requests_per_minute
int - Set a leaky-bucket throttle on requests-per-minute.max_retries
int - The maximum number of retries to use.max_retry_wait
float - The maximum backoff time.sleep_on_rate_limit_recommendation
bool - Whether to adhere to sleep recommendations (Azure).concurrent_requests
int The number of open requests to allow at once.temperature
float - The temperature to use.top_p
float - The top-p value to use.n
int - The number of completions to generate."}, {"location": "config/yaml/#async_mode", "title": "async_mode", "text": "
stagger
float - The threading stagger value.num_threads
int - The maximum number of work threads.asyncio|threaded The async mode to use. Either
"}, {"location": "config/yaml/#embeddings", "title": "embeddings", "text": ""}, {"location": "config/yaml/#fields_2", "title": "Fields", "text": "asyncio
or threaded."}, {"location": "config/yaml/#input", "title": "input", "text": ""}, {"location": "config/yaml/#fields_3", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)batch_size
int - The maximum batch size to use.batch_max_tokens
int - The maximum batch # of tokens.target
required|all|none - Determines which set of embeddings to export.skip
list[str] - Which embeddings to skip. Only useful if target=all to customize the list.vector_store
dict - The vector store to use. Configured for lancedb by default.
type
str -lancedb
orazure_ai_search
. Default=lancedb
db_uri
str (only for lancedb) - The database uri. Default=storage.base_dir/lancedb
url
str (only for AI Search) - AI Search endpointapi_key
str (optional - only for AI Search) - The AI Search api key to use.audience
str (only for AI Search) - Audience for managed identity token if managed identity authentication is used.overwrite
bool (only used at index creation time) - Overwrite collection if it exists. Default=True
container_name
str - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=default
strategy
dict - Fully override the text-embedding strategy."}, {"location": "config/yaml/#chunks", "title": "chunks", "text": ""}, {"location": "config/yaml/#fields_4", "title": "Fields", "text": "
type
file|blob - The input type to use. Default=file
file_type
text|csv - The type of input data to load. Eithertext
orcsv
. Default istext
base_dir
str - The base directory to read input from, relative to the root.connection_string
str - (blob only) The Azure Storage connection string.storage_account_blob_url
str - The storage account blob URL to use.container_name
str - (blob only) The Azure Storage container name.file_encoding
str - The encoding of the input file. Default isutf-8
file_pattern
str - A regex to match input files. Default is.*\\.csv$
if in csv mode and.*\\.txt$
if in text mode.file_filter
dict - Key/value pairs to filter. Default is None.source_column
str - (CSV Mode Only) The source column name.timestamp_column
str - (CSV Mode Only) The timestamp column name.timestamp_format
str - (CSV Mode Only) The source format.text_column
str - (CSV Mode Only) The text column name.title_column
str - (CSV Mode Only) The title column name.document_attribute_columns
list[str] - (CSV Mode Only) The additional document attributes to include."}, {"location": "config/yaml/#cache", "title": "cache", "text": ""}, {"location": "config/yaml/#fields_5", "title": "Fields", "text": "
size
int - The max chunk size in tokens.overlap
int - The chunk overlap in tokens.group_by_columns
list[str] - group documents by fields before chunking.encoding_model
str - The text encoding model to use. Default is to use the top-level encoding model.strategy
dict - Fully override the chunking strategy."}, {"location": "config/yaml/#storage", "title": "storage", "text": ""}, {"location": "config/yaml/#fields_6", "title": "Fields", "text": "
type
file|memory|none|blob - The cache type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write cache to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#update_index_storage", "title": "update_index_storage", "text": ""}, {"location": "config/yaml/#fields_7", "title": "Fields", "text": "
type
file|memory|blob - The storage type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write output artifacts to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#reporting", "title": "reporting", "text": ""}, {"location": "config/yaml/#fields_8", "title": "Fields", "text": "
type
file|memory|blob - The storage type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write output artifacts to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#entity_extraction", "title": "entity_extraction", "text": ""}, {"location": "config/yaml/#fields_9", "title": "Fields", "text": "
type
file|console|blob - The reporting type to use. Default=file
connection_string
str - (blob only) The Azure Storage connection string.container_name
str - (blob only) The Azure Storage container name.base_dir
str - The base directory to write reports to, relative to the root.storage_account_blob_url
str - The storage account blob URL to use."}, {"location": "config/yaml/#summarize_descriptions", "title": "summarize_descriptions", "text": ""}, {"location": "config/yaml/#fields_10", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.entity_types
list[str] - The entity types to identify.max_gleanings
int - The maximum number of gleaning cycles to use.encoding_model
str - The text encoding model to use. By default, this will use the top-level encoding model.strategy
dict - Fully override the entity extraction strategy."}, {"location": "config/yaml/#claim_extraction", "title": "claim_extraction", "text": ""}, {"location": "config/yaml/#fields_11", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.max_length
int - The maximum number of output tokens per summarization.strategy
dict - Fully override the summarize description strategy."}, {"location": "config/yaml/#community_reports", "title": "community_reports", "text": ""}, {"location": "config/yaml/#fields_12", "title": "Fields", "text": "
enabled
bool - Whether to enable claim extraction. Off by default, because claim prompts really need user tuning.llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.description
str - Describes the types of claims we want to extract.max_gleanings
int - The maximum number of gleaning cycles to use.encoding_model
str - The text encoding model to use. By default, this will use the top-level encoding model.strategy
dict - Fully override the claim extraction strategy."}, {"location": "config/yaml/#cluster_graph", "title": "cluster_graph", "text": ""}, {"location": "config/yaml/#fields_13", "title": "Fields", "text": "
llm
(see LLM top-level config)parallelization
(see Parallelization top-level config)async_mode
(see Async Mode top-level config)prompt
str - The prompt file to use.max_length
int - The maximum number of output tokens per report.max_input_length
int - The maximum number of input tokens to use when generating reports.strategy
dict - Fully override the community reports strategy."}, {"location": "config/yaml/#embed_graph", "title": "embed_graph", "text": ""}, {"location": "config/yaml/#fields_14", "title": "Fields", "text": "
max_cluster_size
int - The maximum cluster size to export.strategy
dict - Fully override the cluster_graph strategy."}, {"location": "config/yaml/#umap", "title": "umap", "text": ""}, {"location": "config/yaml/#fields_15", "title": "Fields", "text": "
enabled
bool - Whether to enable graph embeddings.num_walks
int - The node2vec number of walks.walk_length
int - The node2vec walk length.window_size
int - The node2vec window size.iterations
int - The node2vec number of iterations.random_seed
int - The node2vec random seed.strategy
dict - Fully override the embed graph strategy."}, {"location": "config/yaml/#snapshots", "title": "snapshots", "text": ""}, {"location": "config/yaml/#fields_16", "title": "Fields", "text": "
enabled
bool - Whether to enable UMAP layouts."}, {"location": "config/yaml/#encoding_model", "title": "encoding_model", "text": "
embeddings
bool - Export embeddings snapshots to parquet.graphml
bool - Export graph snapshots to GraphML.transient
bool - Export transient workflow tables snapshots to parquet.str - The text encoding model to use. Default=
"}, {"location": "config/yaml/#skip_workflows", "title": "skip_workflows", "text": "cl100k_base
.list[str] - Which workflow names to skip.
"}, {"location": "config/yaml/#query", "title": "Query", "text": ""}, {"location": "config/yaml/#local_search", "title": "local_search", "text": ""}, {"location": "config/yaml/#fields_17", "title": "Fields", "text": ""}, {"location": "config/yaml/#global_search", "title": "global_search", "text": ""}, {"location": "config/yaml/#fields_18", "title": "Fields", "text": "
prompt
str - The prompt file to use.text_unit_prop
float - The text unit proportion.community_prop
float - The community proportion.conversation_history_max_turns
int - The conversation history maximum turns.top_k_entities
int - The top k mapped entities.top_k_relationships
int - The top k mapped relations.temperature
float | None - The temperature to use for token generation.top_p
float | None - The top-p value to use for token generation.n
int | None - The number of completions to generate.max_tokens
int - The maximum tokens.llm_max_tokens
int - The LLM maximum tokens."}, {"location": "config/yaml/#drift_search", "title": "drift_search", "text": ""}, {"location": "config/yaml/#fields_19", "title": "Fields", "text": "
map_prompt
str - The mapper prompt file to use.reduce_prompt
str - The reducer prompt file to use.knowledge_prompt
str - The knowledge prompt file to use.map_prompt
str | None - The global search mapper prompt to use.reduce_prompt
str | None - The global search reducer to use.knowledge_prompt
str | None - The global search general prompt to use.temperature
float | None - The temperature to use for token generation.top_p
float | None - The top-p value to use for token generation.n
int | None - The number of completions to generate.max_tokens
int - The maximum context size in tokens.data_max_tokens
int - The data llm maximum tokens.map_max_tokens
int - The map llm maximum tokens.reduce_max_tokens
int - The reduce llm maximum tokens.concurrency
int - The number of concurrent requests.dynamic_search_llm
str - LLM model to use for dynamic community selection.dynamic_search_threshold
int - Rating threshold to include a community report.dynamic_search_keep_parent
bool - Keep parent community if any of the child communities are relevant.dynamic_search_num_repeats
int - Number of times to rate the same community report.dynamic_search_use_summary
bool - Use community summary instead of full_context.dynamic_search_concurrent_coroutines
int - Number of concurrent coroutines to rate community reports.dynamic_search_max_level
int - The maximum level of community hierarchy to consider if none of the processed communities are relevant."}, {"location": "data/operation_dulce/ABOUT/", "title": "About", "text": "
prompt
str - The prompt file to use.temperature
float - The temperature to use for token generation.top_p
float - The top-p value to use for token generation.n
int - The number of completions to generate.max_tokens
int - The maximum context size in tokens.data_max_tokens
int - The data llm maximum tokens.concurrency
int - The number of concurrent requests.drift_k_followups
int - The number of top global results to retrieve.primer_folds
int - The number of folds for search priming.primer_llm_max_tokens
int - The maximum number of tokens for the LLM in primer.n_depth
int - The number of drift search steps to take.local_search_text_unit_prop
float - The proportion of search dedicated to text units.local_search_community_prop
float - The proportion of search dedicated to community properties.local_search_top_k_mapped_entities
int - The number of top K entities to map during local search.local_search_top_k_relationships
int - The number of top K relationships to map during local search.local_search_max_data_tokens
int - The maximum context size in tokens for local search.local_search_temperature
float - The temperature to use for token generation in local search.local_search_top_p
float - The top-p value to use for token generation in local search.local_search_n
int - The number of completions to generate in local search.local_search_llm_max_gen_tokens
int - The maximum number of generated tokens for the LLM in local search.This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
"}, {"location": "index/architecture/", "title": "Indexing Architecture", "text": ""}, {"location": "index/architecture/#key-concepts", "title": "Key Concepts", "text": ""}, {"location": "index/architecture/#knowledge-model", "title": "Knowledge Model", "text": "In order to support the GraphRAG system, the outputs of the indexing engine (in the Default Configuration Mode) are aligned to a knowledge model we call the GraphRAG Knowledge Model. This model is designed to be an abstraction over the underlying data storage technology, and to provide a common interface for the GraphRAG system to interact with. In normal use-cases the outputs of the GraphRAG Indexer would be loaded into a database system, and the GraphRAG's Query Engine would interact with the database using the knowledge model data-store types.
"}, {"location": "index/architecture/#workflows", "title": "Workflows", "text": "Because of the complexity of our data indexing tasks, we needed to be able to express our data pipeline as series of multiple, interdependent workflows.
"}, {"location": "index/architecture/#dataframe-message-format", "title": "Dataframe Message Format", "text": "---\ntitle: Sample Workflow DAG\n---\nstateDiagram-v2\n [*] --> Prepare\n Prepare --> Chunk\n Chunk --> ExtractGraph\n Chunk --> EmbedDocuments\n ExtractGraph --> GenerateReports\n ExtractGraph --> EmbedEntities\n ExtractGraph --> EmbedGraph
The primary unit of communication between workflows, and between workflow steps is an instance of
"}, {"location": "index/architecture/#llm-caching", "title": "LLM Caching", "text": "pandas.DataFrame
. Although side-effects are possible, our goal is to be data-centric and table-centric in our approach to data processing. This allows us to easily reason about our data, and to leverage the power of dataframe-based ecosystems. Our underlying dataframe technology may change over time, but our primary goal is to support the workflow schema while retaining single-machine ease of use and developer ergonomics.The GraphRAG library was designed with LLM interactions in mind, and a common setback when working with LLM APIs is the variety of errors that can occur due to network latency, throttling, etc. Because of these potential error cases, we've added a cache layer around LLM interactions. When completion requests are made using the same input set (prompt and tuning parameters), we return a cached result if one exists. This allows our indexer to be more resilient to network issues, to act idempotently, and to provide a more efficient end-user experience.
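As a minimal sketch of this idea (illustrative only, not the GraphRAG implementation; llm stands in for any completion callable): import hashlib\nimport json\n\n_cache: dict[str, str] = {}\n\ndef cached_complete(llm, prompt: str, **params) -> str:\n    # Key the cache on the full input set: the prompt plus all tuning parameters.\n    key = hashlib.sha256(json.dumps({\"prompt\": prompt, **params}, sort_keys=True).encode()).hexdigest()\n    if key not in _cache:\n        _cache[key] = llm(prompt, **params)  # only call the API on a cache miss\n    return _cache[key]\n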
"}, {"location": "index/default_dataflow/", "title": "Indexing Dataflow", "text": ""}, {"location": "index/default_dataflow/#the-graphrag-knowledge-model", "title": "The GraphRAG Knowledge Model", "text": "The knowledge model is a specification for data outputs that conform to our data-model definition. You can find these definitions in the python/graphrag/graphrag/model folder within the GraphRAG repository. The following entity types are provided. The fields here represent the fields that are text-embedded by default.
"}, {"location": "index/default_dataflow/#the-default-configuration-workflow", "title": "The Default Configuration Workflow", "text": "
Document
- An input document into the system. These either represent individual rows in a CSV or individual .txt file.TextUnit
- A chunk of text to analyze. The size of these chunks, their overlap, and whether they adhere to any data boundaries may be configured below. A common use case is to setCHUNK_BY_COLUMNS
toid
so that there is a 1-to-many relationship between documents and TextUnits instead of a many-to-many.Entity
- An entity extracted from a TextUnit. These represent people, places, events, or some other entity-model that you provide.Relationship
- A relationship between two entities. These are generated from the covariates.Covariate
- Extracted claim information, which contains statements about entities which may be time-bound.Community
- Once the graph of entities and relationships is built, we perform hierarchical community detection on them to create a clustering structure.Community Report
- The contents of each community are summarized into a generated report, useful for human reading and downstream search.Node
- This table contains layout information for rendered graph-views of the Entities and Documents which have been embedded and clustered.Let's take a look at how the default-configuration workflow transforms text documents into the GraphRAG Knowledge Model. This page gives a general overview of the major steps in this process. To fully configure this workflow, check out the configuration documentation.
"}, {"location": "index/default_dataflow/#phase-1-compose-textunits", "title": "Phase 1: Compose TextUnits", "text": "---\ntitle: Dataflow Overview\n---\nflowchart TB\n subgraph phase1[Phase 1: Compose TextUnits]\n documents[Documents] --> chunk[Chunk]\n chunk --> embed[Embed] --> textUnits[Text Units]\n end\n subgraph phase2[Phase 2: Graph Extraction]\n textUnits --> graph_extract[Entity & Relationship Extraction]\n graph_extract --> graph_summarize[Entity & Relationship Summarization]\n graph_summarize --> claim_extraction[Claim Extraction]\n claim_extraction --> graph_outputs[Graph Tables]\n end\n subgraph phase3[Phase 3: Graph Augmentation]\n graph_outputs --> community_detect[Community Detection]\n community_detect --> graph_embed[Graph Embedding]\n graph_embed --> augmented_graph[Augmented Graph Tables]\n end\n subgraph phase4[Phase 4: Community Summarization]\n augmented_graph --> summarized_communities[Community Summarization]\n summarized_communities --> embed_communities[Community Embedding]\n embed_communities --> community_outputs[Community Tables]\n end\n subgraph phase5[Phase 5: Document Processing]\n documents --> link_to_text_units[Link to TextUnits]\n textUnits --> link_to_text_units\n link_to_text_units --> embed_documents[Document Embedding]\n embed_documents --> document_graph[Document Graph Creation]\n document_graph --> document_outputs[Document Tables]\n end\n subgraph phase6[Phase 6: Network Visualization]\n document_outputs --> umap_docs[Umap Documents]\n augmented_graph --> umap_entities[Umap Entities]\n umap_docs --> combine_nodes[Nodes Table]\n umap_entities --> combine_nodes\n end
The first phase of the default-configuration workflow is to transform input documents into TextUnits. A TextUnit is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to provide breadcrumbs and provenance from concepts back to their original source text.
The chunk size (counted in tokens) is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single \"glean\" step. (A \"glean\" step is a follow-on extraction.) Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g., tweets or a chat log).
Each of these text units is text-embedded and passed into the next phase of the pipeline.
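As a rough sketch of the token-based chunking described above (using the tiktoken tokenizer with the default 300-token size; the 100-token overlap here is illustrative): import tiktoken\n\ndef chunk_text(text: str, size: int = 300, overlap: int = 100) -> list[str]:\n    # Encode to tokens, then slide a window of size tokens that overlaps the previous one.\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    tokens = enc.encode(text)\n    step = size - overlap\n    return [enc.decode(tokens[start:start + size]) for start in range(0, len(tokens), step)]\n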
"}, {"location": "index/default_dataflow/#phase-2-graph-extraction", "title": "Phase 2: Graph Extraction", "text": "---\ntitle: Documents into Text Chunks\n---\nflowchart LR\n doc1[Document 1] --> tu1[TextUnit 1]\n doc1 --> tu2[TextUnit 2]\n doc2[Document 2] --> tu3[TextUnit 3]\n doc2 --> tu4[TextUnit 4]\n
In this phase, we analyze each text unit and extract our graph primitives: Entities, Relationships, and Claims. Entities and Relationships are extracted at once in our entity_extract verb, and claims are extracted in our claim_extract verb. Results are then combined and passed into following phases of the pipeline.
"}, {"location": "index/default_dataflow/#entity-relationship-extraction", "title": "Entity & Relationship Extraction", "text": "---\ntitle: Graph Extraction\n---\nflowchart LR\n tu[TextUnit] --> ge[Graph Extraction] --> gs[Graph Summarization]\n tu --> ce[Claim Extraction]
In this first step of graph extraction, we process each text-unit in order to extract entities and relationships out of the raw text using the LLM. The output of this step is a subgraph-per-TextUnit containing a list of entities with a name, type, and description, and a list of relationships with a source, target, and description.
These subgraphs are merged together - any entities with the same name and type are merged by creating an array of their descriptions. Similarly, any relationships with the same source and target are merged by creating an array of their descriptions.
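A minimal sketch of that merge (record shapes are illustrative, not the GraphRAG internals; relationships are merged the same way, keyed on (source, target)): from collections import defaultdict\n\ndef merge_entities(subgraphs: list[list[dict]]) -> dict:\n    # Entities sharing a (name, type) key collapse into one record whose\n    # description is the array of every description observed for that entity.\n    merged: dict = defaultdict(list)\n    for entities in subgraphs:\n        for entity in entities:\n            merged[(entity[\"name\"], entity[\"type\"])].append(entity[\"description\"])\n    return dict(merged)\n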
"}, {"location": "index/default_dataflow/#entity-relationship-summarization", "title": "Entity & Relationship Summarization", "text": "Now that we have a graph of entities and relationships, each with a list of descriptions, we can summarize these lists into a single description per entity and relationship. This is done by asking the LLM for a short summary that captures all of the distinct information from each description. This allows all of our entities and relationships to have a single concise description.
"}, {"location": "index/default_dataflow/#claim-extraction-emission", "title": "Claim Extraction & Emission", "text": "Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These get exported as a primary artifact called Covariates.
Note: claim extraction is optional and turned off by default. This is because claim extraction generally requires prompt tuning to be useful.
"}, {"location": "index/default_dataflow/#phase-3-graph-augmentation", "title": "Phase 3: Graph Augmentation", "text": "Now that we have a usable graph of entities and relationships, we want to understand their community structure and augment the graph with additional information. This is done in two steps: Community Detection and Graph Embedding. These give us explicit (communities) and implicit (embeddings) ways of understanding the topological structure of our graph.
"}, {"location": "index/default_dataflow/#community-detection", "title": "Community Detection", "text": "---\ntitle: Graph Augmentation\n---\nflowchart LR\n cd[Leiden Hierarchical Community Detection] --> ge[Node2Vec Graph Embedding] --> ag[Graph Table Emission]
In this step, we generate a hierarchy of entity communities using the Hierarchical Leiden Algorithm. This method will apply a recursive community-clustering to our graph until we reach a community-size threshold. This will allow us to understand the community structure of our graph and provide a way to navigate and summarize the graph at different levels of granularity.
"}, {"location": "index/default_dataflow/#graph-embedding", "title": "Graph Embedding", "text": "In this step, we generate a vector representation of our graph using the Node2Vec algorithm. This will allow us to understand the implicit structure of our graph and provide an additional vector-space in which to search for related concepts during our query phase.
"}, {"location": "index/default_dataflow/#graph-tables-emission", "title": "Graph Tables Emission", "text": "Once our graph augmentation steps are complete, the final Entities and Relationships tables are exported after their text fields are text-embedded.
"}, {"location": "index/default_dataflow/#phase-4-community-summarization", "title": "Phase 4: Community Summarization", "text": "---\ntitle: Community Summarization\n---\nflowchart LR\n sc[Generate Community Reports] --> ss[Summarize Community Reports] --> ce[Community Embedding] --> co[Community Tables Emission]
At this point, we have a functional graph of entities and relationships, a hierarchy of communities for the entities, as well as node2vec embeddings.
Now we want to build on the communities data and generate reports for each community. This gives us a high-level understanding of the graph at several points of graph granularity. For example, if community A is the top-level community, we'll get a report about the entire graph. If the community is lower-level, we'll get a report about a local cluster.
"}, {"location": "index/default_dataflow/#generate-community-reports", "title": "Generate Community Reports", "text": "In this step, we generate a summary of each community using the LLM. This will allow us to understand the distinct information contained within each community and provide a scoped understanding of the graph, from either a high-level or a low-level perspective. These reports contain an executive overview and reference the key entities, relationships, and claims within the community sub-structure.
"}, {"location": "index/default_dataflow/#summarize-community-reports", "title": "Summarize Community Reports", "text": "In this step, each community report is then summarized via the LLM for shorthand use.
"}, {"location": "index/default_dataflow/#community-embedding", "title": "Community Embedding", "text": "In this step, we generate a vector representation of our communities by generating text embeddings of the community report, the community report summary, and the title of the community report.
"}, {"location": "index/default_dataflow/#community-tables-emission", "title": "Community Tables Emission", "text": "At this point, some bookkeeping work is performed and we export the Communities and CommunityReports tables.
"}, {"location": "index/default_dataflow/#phase-5-document-processing", "title": "Phase 5: Document Processing", "text": "In this phase of the workflow, we create the Documents table for the knowledge model.
"}, {"location": "index/default_dataflow/#augment-with-columns-csv-only", "title": "Augment with Columns (CSV Only)", "text": "---\ntitle: Document Processing\n---\nflowchart LR\n aug[Augment] --> dp[Link to TextUnits] --> de[Avg. Embedding] --> dg[Document Table Emission]
If the workflow is operating on CSV data, you may configure your workflow to add additional fields to the Documents output. These fields should exist on the incoming CSV tables. Details about configuring this can be found in the configuration documentation.
"}, {"location": "index/default_dataflow/#link-to-textunits", "title": "Link to TextUnits", "text": "In this step, we link each document to the text-units that were created in the first phase. This allows us to understand which documents are related to which text-units and vice-versa.
"}, {"location": "index/default_dataflow/#document-embedding", "title": "Document Embedding", "text": "In this step, we generate a vector representation of our documents using an average embedding of document slices. We re-chunk documents without overlapping chunks, and then generate an embedding for each chunk. We create an average of these chunks weighted by token-count and use this as the document embedding. This will allow us to understand the implicit relationship between documents, and will help us generate a network representation of our documents.
"}, {"location": "index/default_dataflow/#documents-table-emission", "title": "Documents Table Emission", "text": "At this point, we can export the Documents table into the knowledge Model.
"}, {"location": "index/default_dataflow/#phase-6-network-visualization", "title": "Phase 6: Network Visualization", "text": "In this phase of the workflow, we perform some steps to support network visualization of our high-dimensional vector spaces within our existing graphs. At this point there are two logical graphs at play: the Entity-Relationship graph and the Document graph.
---\ntitle: Network Visualization Workflows\n---\nflowchart LR\n nv[Umap Documents] --> ne[Umap Entities] --> ng[Nodes Table Emission]
For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then exported as a table of Nodes. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates.
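For reference, a 2D UMAP reduction of an embedding matrix with the umap-learn package looks roughly like this (the matrix shape and parameters are illustrative): import numpy as np\nimport umap\n\nembeddings = np.random.rand(100, 1536)  # one row per entity or document embedding\ncoords = umap.UMAP(n_components=2).fit_transform(embeddings)\nprint(coords.shape)  # (100, 2): x/y positions like those exported to the Nodes table\n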
"}, {"location": "index/outputs/", "title": "Outputs", "text": "The default pipeline produces a series of output tables that align with the conceptual knowledge model. This page describes the detailed output table schemas. By default we write these tables out as parquet files on disk.
"}, {"location": "index/outputs/#shared-fields", "title": "Shared fields", "text": "All tables have two identifier fields:
name type description id str Generated UUID, assuring global uniqueness human_readable_id int This is an incremented short ID created per-run. For example, we use this short ID with generated summaries that print citations so they are easy to cross-reference visually."}, {"location": "index/outputs/#create_final_communities", "title": "create_final_communities", "text": "This is a list of the final communities generated by Leiden. Communities are strictly hierarchical, subdividing into children as the cluster affinity is narrowed.
name type description community int Leiden-generated cluster ID for the community. Note that these increment with depth, so they are unique through all levels of the community hierarchy. For this table, human_readable_id is a copy of the community ID rather than a plain increment. parent int Parent community ID. level int Depth of the community in the hierarchy. title str Friendly name of the community. entity_ids str[] List of entities that are members of the community. relationship_ids str[] List of relationships that are wholly within the community (source and target are both in the community). text_unit_ids str[] List of text units represented within the community. period str Date of ingest, used for incremental update merges. ISO8601 size int Size of the community (entity count), used for incremental update merges."}, {"location": "index/outputs/#create_final_community_reports", "title": "create_final_community_reports", "text": "This is the list of summarized reports for each community.
name type description community int Short ID of the community this report applies to. parent int Parent community ID. level int Level of the community this report applies to. title str LM-generated title for the report. summary str LM-generated summary of the report. full_content str LM-generated full report. rank float LM-derived relevance ranking of the report based on member entity salience. rank_explanation str LM-derived explanation of the rank. findings dict LM-derived list of the top 5-10 insights from the community. Containssummary
andexplanation
values. full_content_json json Full JSON output as returned by the LM. Most fields are extracted into columns, but this JSON is sent for query summarization so we leave it to allow for prompt tuning to add fields/content by end users. period str Date of ingest, used for incremental update merges. ISO8601 size int Size of the community (entity count), used for incremental update merges."}, {"location": "index/outputs/#create_final_covariates", "title": "create_final_covariates", "text": "(Optional) If claim extraction is turned on, this is a list of the extracted covariates. Note that claims are typically oriented around identifying malicious behavior such as fraud, so they are not useful for all datasets.
name type description covariate_type str This is always \"claim\" with our default covariates. type str Nature of the claim type. description str LM-generated description of the behavior. subject_id str Name of the source entity (that is performing the claimed behavior). object_id str Name of the target entity (that the claimed behavior is performed on). status str LM-derived assessment of the correctness of the claim. One of [TRUE, FALSE, SUSPECTED] start_date str LM-derived start of the claimed activity. ISO8601 end_date str LM-derived end of the claimed activity. ISO8601 source_text str Short string of text containing the claimed behavior. text_unit_id str ID of the text unit the claim text was extracted from."}, {"location": "index/outputs/#create_final_documents", "title": "create_final_documents", "text": "List of document content after import.
name type description title str Filename, unless otherwise configured during CSV import. text str Full text of the document. text_unit_ids str[] List of text units (chunks) that were parsed from the document. attributes dict (optional) If specified during CSV import, this is a dict of attributes for the document."}, {"location": "index/outputs/#create_final_entities", "title": "create_final_entities", "text": "List of all entities found in the data by the LM.
name type description title str Name of the entity. type str Type of the entity. By default this will be \"organization\", \"person\", \"geo\", or \"event\" unless configured differently or auto-tuning is used. description str Textual description of the entity. Entities may be found in many text units, so this is an LM-derived summary of all descriptions. text_unit_ids str[] List of the text units containing the entity."}, {"location": "index/outputs/#create_final_nodes", "title": "create_final_nodes", "text": "This is graph-related information for the entities. It contains only information relevant to the graph such as community. There is an entry for each entity at every community level it is found within, so you may see \"duplicate\" entities.
Note that the ID fields match those in create_final_entities and can be used for joining if additional information about a node is required.
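For example, a pandas join on the shared id field (paths illustrative): import pandas as pd\n\nnodes = pd.read_parquet(\"output/create_final_nodes.parquet\")\nentities = pd.read_parquet(\"output/create_final_entities.parquet\")\n# Attach entity details (type, description, ...) to each per-community node row.\nnodes_with_details = nodes.merge(entities, on=\"id\", suffixes=(\"_node\", \"_entity\"))\n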
name type description title str Name of the referenced entity. Duplicated from create_final_entities for convenient cross-referencing. community int Leiden community the node is found within. Entities are not always assigned a community (they may not be close enough to any), so they may have an ID of -1. level int Level of the community the entity is in. degree int Node degree (connectedness) in the graph. x float X position of the node for visual layouts. If graph embeddings and UMAP are not turned on, this will be 0. y float Y position of the node for visual layouts. If graph embeddings and UMAP are not turned on, this will be 0."}, {"location": "index/outputs/#create_final_relationships", "title": "create_final_relationships", "text": "List of all entity-to-entity relationships found in the data by the LM. This is also the edge list for the graph.
name type description source str Name of the source entity. target str Name of the target entity. description str LM-derived description of the relationship. Also see note for entity descriptions. weight float Weight of the edge in the graph. This is summed from an LM-derived \"strength\" measure for each relationship instance. combined_degree int Sum of source and target node degrees. text_unit_ids str[] List of text units the relationship was found within."}, {"location": "index/outputs/#create_final_text_units", "title": "create_final_text_units", "text": "List of all text chunks parsed from the input documents.
name type description text str Raw full text of the chunk. n_tokens int Number of tokens in the chunk. This should normally match thechunk_size
config parameter, except for the last chunk which is often shorter. document_ids str[] List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. entity_ids str[] List of entities found in the text unit. relationships_ids str[] List of relationships found in the text unit. covariate_ids str[] Optional list of covariates found in the text unit."}, {"location": "index/overview/", "title": "GraphRAG Indexing \ud83e\udd16", "text": "The GraphRAG indexing package is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using LLMs.
Indexing Pipelines are configurable. They are composed of workflows, standard and custom steps, prompt templates, and input/output adapters. Our standard pipeline is designed to:
- extract entities, relationships and claims from raw text
- perform community detection in entities
- generate community summaries and reports at multiple levels of granularity
- embed entities into a graph vector space
- embed text chunks into a textual vector space
The outputs of the pipeline can be stored in a variety of formats, including JSON and Parquet - or they can be handled manually via the Python API.
"}, {"location": "index/overview/#getting-started", "title": "Getting Started", "text": ""}, {"location": "index/overview/#requirements", "title": "Requirements", "text": "See the requirements section in Get Started for details on setting up a development environment.
The Indexing Engine can be used in either a default configuration mode or with a custom pipeline. To configure GraphRAG, see the configuration documentation. After you have a config file you can run the pipeline using the CLI or the Python API.
"}, {"location": "index/overview/#usage", "title": "Usage", "text": ""}, {"location": "index/overview/#cli", "title": "CLI", "text": ""}, {"location": "index/overview/#python-api", "title": "Python API", "text": "# Via Poetry\npoetry run poe cli --root <data_root> # default config mode\npoetry run poe cli --config your_pipeline.yml # custom config mode\n\n# Via Node\nyarn run:index --root <data_root> # default config mode\nyarn run:index --config your_pipeline.yml # custom config mode\n
Please see the examples folder for a handful of functional pipelines illustrating how to create and run them via a custom settings.yml or through custom Python scripts.
"}, {"location": "index/overview/#further-reading", "title": "Further Reading", "text": ""}, {"location": "prompt_tuning/auto_prompt_tuning/", "title": "Auto Prompt Tuning \u2699\ufe0f", "text": "
- To start developing within the GraphRAG project, see getting started
- To understand the underlying concepts and execution model of the indexing library, see the architecture documentation
- To get running with a series of examples, see the examples documentation
- To read more about configuring the indexing engine, see the configuration documentation
GraphRAG provides the ability to create domain adapted prompts for the generation of the knowledge graph. This step is optional, though it is highly encouraged to run it as it will yield better results when executing an Index Run.
These are generated by loading the inputs, splitting them into chunks (text units), and then running a series of LLM invocations and template substitutions to generate the final prompts. We suggest using the default values provided by the script, but on this page you'll find the details of each option in case you want to explore and tweak the prompt tuning algorithm further.
Figure 1: Auto Tuning Conceptual Diagram.
"}, {"location": "prompt_tuning/auto_prompt_tuning/#prerequisites", "title": "Prerequisites", "text": "Before running auto tuning, ensure you have already initialized your workspace with the
"}, {"location": "prompt_tuning/auto_prompt_tuning/#usage", "title": "Usage", "text": "graphrag init
command. This will create the necessary configuration files and the default prompts. Refer to the Init Documentation for more information about the initialization process.You can run the main script from the command line with various options:
"}, {"location": "prompt_tuning/auto_prompt_tuning/#command-line-options", "title": "Command-Line Options", "text": "graphrag prompt-tune [--root ROOT] [--config CONFIG] [--domain DOMAIN] [--selection-method METHOD] [--limit LIMIT] [--language LANGUAGE] \\\n[--max-tokens MAX_TOKENS] [--chunk-size CHUNK_SIZE] [--n-subset-max N_SUBSET_MAX] [--k K] \\\n[--min-examples-required MIN_EXAMPLES_REQUIRED] [--discover-entity-types] [--output OUTPUT]\n
"}, {"location": "prompt_tuning/auto_prompt_tuning/#example-usage", "title": "Example Usage", "text": "
--config
(required): The path to the configuration file. This is required to load the data and model settings.
--root
(optional): The data project root directory, including the config files (YML, JSON, or .env). Defaults to the current directory.
--domain
(optional): The domain related to your input data, such as 'space science', 'microbiology', or 'environmental news'. If left empty, the domain will be inferred from the input data.
--method
(optional): The method to select documents. Options are all, random, auto or top. Default is random.
--limit
(optional): The limit of text units to load when using random or top selection. Default is 15.
--language
(optional): The language to use for input processing. If it is different from the inputs' language, the LLM will translate. Default is \"\", meaning it will be automatically detected from the inputs.
--max-tokens
(optional): Maximum token count for prompt generation. Default is 2000.
--chunk-size
(optional): The size in tokens to use for generating text units from input documents. Default is 200.
--n-subset-max
(optional): The number of text chunks to embed when using auto selection method. Default is 300.
--k
(optional): The number of documents to select when using auto selection method. Default is 15.
--min-examples-required
(optional): The minimum number of examples required for entity extraction prompts. Default is 2.
--discover-entity-types
(optional): Allow the LLM to discover and extract entities automatically. We recommend using this when your data covers a lot of topics or it is highly randomized.
--output
(optional): The folder to save the generated prompts. Default is \"prompts\".python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --domain \"environmental news\" \\\n--method random --limit 10 --language English --max-tokens 2048 --chunk-size 256 --min-examples-required 3 \\\n--no-entity-types --output /path/to/output\n
or, with minimal configuration (suggested):
"}, {"location": "prompt_tuning/auto_prompt_tuning/#document-selection-methods", "title": "Document Selection Methods", "text": "python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --no-entity-types\n
The auto tuning feature ingests the input data and then divides it into text units the size of the chunk size parameter. After that, it uses one of the following selection methods to pick a sample to work with for prompt generation:
"}, {"location": "prompt_tuning/auto_prompt_tuning/#modify-env-vars", "title": "Modify Env Vars", "text": "
random
: Select text units randomly. This is the default and recommended option.top
: Select the head n text units.all
: Use all text units for the generation. Use only with small datasets; this option is not usually recommended.auto
: Embed text units in a lower-dimensional space and select the k nearest neighbors to the centroid. This is useful when you have a large dataset and want to select a representative sample.After running auto tuning, you should modify the following environment variables (or config variables) to pick up the new prompts on your index run. Note: Please make sure to update the correct path to the generated prompts. In this example we are using the default \"prompts\" path.
GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE
= \"prompts/entity_extraction.txt\"
GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE
= \"prompts/community_report.txt\"
GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE
= \"prompts/summarize_descriptions.txt\"or in your yaml config file:
"}, {"location": "prompt_tuning/manual_prompt_tuning/", "title": "Manual Prompt Tuning \u2699\ufe0f", "text": "entity_extraction:\n prompt: \"prompts/entity_extraction.txt\"\n\nsummarize_descriptions:\n prompt: \"prompts/summarize_descriptions.txt\"\n\ncommunity_reports:\n prompt: \"prompts/community_report.txt\"\n
The GraphRAG indexer, by default, will run with a handful of prompts that are designed to work well in the broad context of knowledge discovery. However, it is quite common to want to tune the prompts to better suit your specific use case. We provide a means for you to do this by allowing you to specify custom prompt files, each of which uses a series of token replacements internally.
Each of these prompts may be overridden by writing a custom prompt file in plaintext. We use token-replacements in the form of
"}, {"location": "prompt_tuning/manual_prompt_tuning/#indexing-prompts", "title": "Indexing Prompts", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#entityrelationship-extraction", "title": "Entity/Relationship Extraction", "text": "{token_name}
, and the descriptions for the available tokens can be found below.Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#summarize-entityrelationship-descriptions", "title": "Summarize Entity/Relationship Descriptions", "text": "
- {input_text} - The input text to be processed.
- {entity_types} - A list of entity types
- {tuple_delimiter} - A delimiter for separating values within a tuple. A single tuple is used to represent an individual entity or relationship.
- {record_delimiter} - A delimiter for separating tuple instances.
- {completion_delimiter} - An indicator for when generation is complete.
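As an illustration of how such tokens are substituted, here is a toy template filled with Python's str.format (the template text and delimiter values are hypothetical, not the shipped prompt): template = (\"Extract entities of types {entity_types} from the text. \"\n            \"Separate fields with {tuple_delimiter}, records with {record_delimiter}, \"\n            \"and finish with {completion_delimiter}. Text: {input_text}\")\nfilled = template.format(\n    input_text=\"Some chunk of source text...\",\n    entity_types=\"organization, person, geo, event\",\n    tuple_delimiter=\"<|>\",\n    record_delimiter=\"##\",\n    completion_delimiter=\"<|COMPLETE|>\",\n)\nprint(filled)\n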
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_1", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#claim-extraction", "title": "Claim Extraction", "text": "
- {entity_name} - The name of the entity or the source/target pair of the relationship.
- {description_list} - A list of descriptions for the entity or relationship.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_2", "title": "Tokens", "text": "
- {input_text} - The input text to be processed.
- {tuple_delimiter} - A delimiter for separating values within a tuple. A single tuple is used to represent an individual entity or relationship.
- {record_delimiter} - A delimiter for separating tuple instances.
- {completion_delimiter} - An indicator for when generation is complete.
- {entity_specs} - A list of entity types.
- {claim_description} - Description of what claims should look like. Default is:
\"Any claims or facts that could be relevant to information discovery.\"
See the configuration documentation for details on how to change this.
"}, {"location": "prompt_tuning/manual_prompt_tuning/#generate-community-reports", "title": "Generate Community Reports", "text": "Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_3", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#query-prompts", "title": "Query Prompts", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#local-search", "title": "Local Search", "text": "
- {input_text} - The input text to generate the report with. This will contain tables of entities and relationships.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_4", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#global-search", "title": "Global Search", "text": "
- {response_type} - Describe how the response should look. We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
Mapper Prompt Source
Reducer Prompt Source
Knowledge Prompt Source
Global search uses a map/reduce approach to summarization. You can tune these prompts independently. This search also includes the ability to adjust the use of general knowledge from the model's training.
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_5", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/manual_prompt_tuning/#drift-search", "title": "Drift Search", "text": "
- {response_type} - Describe how the response should look (reducer only). We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
Prompt Source
"}, {"location": "prompt_tuning/manual_prompt_tuning/#tokens_6", "title": "Tokens", "text": ""}, {"location": "prompt_tuning/overview/", "title": "Prompt Tuning \u2699\ufe0f", "text": "
- {response_type} - Describe how the response should look. We default to \"multiple paragraphs\".
- {context_data} - The data tables from GraphRAG's index.
- {community_reports} - The most relevant community reports to include in the summarization.
- {query} - The query text as injected into the context.
This page provides an overview of the prompt tuning options available for the GraphRAG indexing engine.
"}, {"location": "prompt_tuning/overview/#default-prompts", "title": "Default Prompts", "text": "The default prompts are the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. More details about each of the default prompts for indexing and query can be found on the manual tuning page.
"}, {"location": "prompt_tuning/overview/#auto-tuning", "title": "Auto Tuning", "text": "Auto Tuning leverages your input data and LLM interactions to create domain adapted prompts for the generation of the knowledge graph. It is highly encouraged to run it as it will yield better results when executing an Index Run. For more details about how to use it, please refer to the Auto Tuning documentation.
"}, {"location": "prompt_tuning/overview/#manual-tuning", "title": "Manual Tuning", "text": "Manual tuning is an advanced use-case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the manual tuning documentation.
"}, {"location": "query/drift_search/", "title": "DRIFT Search \ud83d\udd0e", "text": ""}, {"location": "query/drift_search/#combining-local-and-global-search", "title": "Combining Local and Global Search", "text": "GraphRAG is a technique that uses large language models (LLMs) to create knowledge graphs and summaries from unstructured text documents and leverages them to improve retrieval-augmented generation (RAG) operations on private datasets. It offers comprehensive global overviews of large, private troves of unstructured text documents while also enabling exploration of detailed, localized information. By using LLMs to create comprehensive knowledge graphs that connect and describe entities and relationships contained in those documents, GraphRAG leverages semantic structuring of the data to generate responses to a wide variety of complex user queries.
DRIFT search (Dynamic Reasoning and Inference with Flexible Traversal) builds upon Microsoft\u2019s GraphRAG technique, combining characteristics of both global and local search to generate detailed responses in a way that balances computational costs with quality outcomes.
"}, {"location": "query/drift_search/#methodology", "title": "Methodology", "text": "Figure 1. An entire DRIFT search hierarchy highlighting the three core phases of the DRIFT search process. A (Primer): DRIFT compares the user\u2019s query with the top K most semantically relevant community reports, generating a broad initial answer and follow-up questions to steer further exploration. B (Follow-Up): DRIFT uses local search to refine queries, producing additional intermediate answers and follow-up questions that enhance specificity, guiding the engine towards context-rich information. A glyph on each node in the diagram shows the confidence the algorithm has to continue the query expansion step. C (Output Hierarchy): The final output is a hierarchical structure of questions and answers ranked by relevance, reflecting a balanced mix of global insights and local refinements, making the results adaptable and comprehensive.
DRIFT Search introduces a new approach to local search queries by including community information in the search process. This greatly expands the breadth of the query\u2019s starting point and leads to retrieval and usage of a far higher variety of facts in the final answer. This addition expands the GraphRAG query engine by providing a more comprehensive option for local search, which uses community insights to refine a query into detailed follow-up questions.
"}, {"location": "query/drift_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the DRIFTSearch class:
"}, {"location": "query/drift_search/#how-to-use", "title": "How to Use", "text": "
llm
: OpenAI model object to be used for response generationcontext_builder
: context builder object to be used for preparing context data from community reports and query informationconfig
: model to define the DRIFT Search hyperparameters. DRIFT Config modeltoken_encoder
: token encoder for tracking the budget for the algorithm.query_state
: a state object as defined in Query State that allows tracking the execution of a DRIFT Search instance, alongside follow-ups and DRIFT actions.An example of a DRIFT search scenario can be found in the following notebook.
"}, {"location": "query/drift_search/#learn-more", "title": "Learn More", "text": "For a more in-depth look at the DRIFT search method, please refer to our DRIFT Search blog post
"}, {"location": "query/global_search/", "title": "Global Search \ud83d\udd0e", "text": ""}, {"location": "query/global_search/#whole-dataset-reasoning", "title": "Whole Dataset Reasoning", "text": "Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as \u201cWhat are the top 5 themes in the data?\u201d perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information.
However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole. This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized. Using our global search method, the LLM uses these clusters to summarize these themes when responding to a user query.
"}, {"location": "query/global_search/#methodology", "title": "Methodology", "text": "---\ntitle: Global Search Dataflow\n---\n%%{ init: { 'flowchart': { 'curve': 'step' } } }%%\nflowchart LR\n\n uq[User Query] --- .1\n ch1[Conversation History] --- .1\n\n subgraph RIR\n direction TB\n ri1[Rated Intermediate<br/>Response 1]~~~ri2[Rated Intermediate<br/>Response 2] -.\"{1..N}\".-rin[Rated Intermediate<br/>Response N]\n end\n\n .1--Shuffled Community<br/>Report Batch 1-->RIR\n .1--Shuffled Community<br/>Report Batch 2-->RIR---.2\n .1--Shuffled Community<br/>Report Batch N-->RIR\n\n .2--Ranking +<br/>Filtering-->agr[Aggregated Intermediate<br/>Responses]-->res[Response]\n\n\n\n classDef green fill:#26B653,stroke:#333,stroke-width:2px,color:#fff;\n classDef turquoise fill:#19CCD3,stroke:#333,stroke-width:2px,color:#fff;\n classDef rose fill:#DD8694,stroke:#333,stroke-width:2px,color:#fff;\n classDef orange fill:#F19914,stroke:#333,stroke-width:2px,color:#fff;\n classDef purple fill:#B356CD,stroke:#333,stroke-width:2px,color:#fff;\n classDef invisible fill:#fff,stroke:#fff,stroke-width:0px,color:#fff, width:0px;\n class uq,ch1 turquoise;\n class ri1,ri2,rin rose;\n class agr orange;\n class res purple;\n class .1,.2 invisible;\n
Given a user query and, optionally, the conversation history, the global search method uses a collection of LLM-generated community reports from a specified level of the graph's community hierarchy as context data to generate a response in a map-reduce manner. At the map step, community reports are segmented into text chunks of pre-defined size. Each text chunk is then used to produce an intermediate response containing a list of points, each accompanied by a numerical rating indicating its importance. At the reduce step, a filtered set of the most important points from the intermediate responses is aggregated and used as the context to generate the final response. The quality of the global search\u2019s response can be heavily influenced by the level of the community hierarchy chosen for sourcing community reports. Lower hierarchy levels, with their detailed reports, tend to yield more thorough responses, but may also increase the time and LLM resources needed to generate the final response due to the volume of reports.
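The map-reduce flow can be sketched in a few lines of self-contained Python. This is only an illustration of the shape of the computation, not the GraphRAG implementation: llm_map and llm_reduce are mock stand-ins for the actual LLM calls, and the ratings they return are fabricated.

```python
from itertools import islice


def chunk(reports: list[str], batch_size: int) -> list[list[str]]:
    """Split community reports into fixed-size batches (the inputs to the map stage)."""
    it = iter(reports)
    return [batch for batch in iter(lambda: list(islice(it, batch_size)), [])]


def llm_map(batch: list[str], query: str) -> list[tuple[str, int]]:
    """Mock map-stage LLM call: returns (point, importance rating) pairs for one batch."""
    return [(f"Point about '{query}' drawn from: {report[:40]}", len(report) % 10) for report in batch]


def llm_reduce(points: list[tuple[str, int]], query: str) -> str:
    """Mock reduce-stage LLM call: composes the final answer from the top points."""
    return " ".join(point for point, _ in points)


def global_search_sketch(reports: list[str], query: str, batch_size: int = 3, top_n: int = 5) -> str:
    # Map: each batch of community reports yields rated intermediate points.
    intermediate: list[tuple[str, int]] = []
    for batch in chunk(reports, batch_size):
        intermediate.extend(llm_map(batch, query))
    # Reduce: filter to the highest-rated points and aggregate them into the final response.
    top_points = sorted(intermediate, key=lambda p: p[1], reverse=True)[:top_n]
    return llm_reduce(top_points, query)


print(global_search_sketch([f"community report {i}" for i in range(7)], "What are the top themes?"))
```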
"}, {"location": "query/global_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the GlobalSearch class:
"}, {"location": "query/global_search/#how-to-use", "title": "How to Use", "text": "
- llm: OpenAI model object to be used for response generation
- context_builder: context builder object to be used for preparing context data from community reports
- map_system_prompt: prompt template used in the map stage. Default template can be found at map_system_prompt
- reduce_system_prompt: prompt template used in the reduce stage. Default template can be found at reduce_system_prompt
- response_type: free-form text describing the desired response type and format (e.g., Multiple Paragraphs, Multi-Page Report)
- allow_general_knowledge: setting this to True will add instructions to the reduce_system_prompt that prompt the LLM to incorporate relevant real-world knowledge from outside the dataset. Note that this may increase hallucinations, but can be useful for certain scenarios. Default is False
- general_knowledge_inclusion_prompt: instruction added to the reduce_system_prompt when allow_general_knowledge is enabled. Default instruction can be found at general_knowledge_instruction
- max_data_tokens: token budget for the context data
- map_llm_params: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call at the map stage
- reduce_llm_params: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call at the reduce stage
- context_builder_params: a dictionary of additional parameters to be passed to the context_builder object when building the context window for the map stage
- concurrent_coroutines: controls the degree of parallelism in the map stage
- callbacks: optional callback functions that can be used to provide custom event handlers for the LLM's completion streaming events
An example of a global search scenario can be found in the following notebook.
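For orientation before opening the notebook, a minimal sketch of constructing the engine from these parameters might look as follows. The import path and the pre-built llm and context_builder objects are assumptions drawn from the global search notebook, and the numeric values are illustrative rather than recommended defaults.

```python
# Assumed import path; verify against your installed GraphRAG version.
from graphrag.query.structured_search.global_search.search import GlobalSearch

search_engine = GlobalSearch(
    llm=llm,                              # OpenAI model object (assumed to exist)
    context_builder=context_builder,      # community-report context builder (assumed to exist)
    max_data_tokens=12_000,               # token budget for the context data
    map_llm_params={"max_tokens": 1_000, "temperature": 0.0},
    reduce_llm_params={"max_tokens": 2_000, "temperature": 0.0},
    allow_general_knowledge=False,        # keep answers grounded in the dataset
    concurrent_coroutines=32,             # parallelism of the map stage
    response_type="Multiple Paragraphs",  # free-form description of the desired output
)

# In a notebook or other async context:
result = await search_engine.asearch("What are the top 5 themes in the data?")
print(result.response)
```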
"}, {"location": "query/local_search/", "title": "Local Search \ud83d\udd0e", "text": ""}, {"location": "query/local_search/#entity-based-reasoning", "title": "Entity-based Reasoning", "text": "The local search method combines structured data from the knowledge graph with unstructured data from the input documents to augment the LLM context with relevant entity information at query time. It is well-suited for answering questions that require an understanding of specific entities mentioned in the input documents (e.g., \u201cWhat are the healing properties of chamomile?\u201d).
"}, {"location": "query/local_search/#methodology", "title": "Methodology", "text": "---\ntitle: Local Search Dataflow\n---\n%%{ init: { 'flowchart': { 'curve': 'step' } } }%%\nflowchart LR\n\n uq[User Query] ---.1\n ch1[Conversation<br/>History]---.1\n\n .1--Entity<br/>Description<br/>Embedding--> ee[Extracted Entities]\n\n ee[Extracted Entities] ---.2--Entity-Text<br/>Unit Mapping--> ctu[Candidate<br/>Text Units]--Ranking + <br/>Filtering -->ptu[Prioritized<br/>Text Units]---.3\n .2--Entity-Report<br/>Mapping--> ccr[Candidate<br/>Community Reports]--Ranking + <br/>Filtering -->pcr[Prioritized<br/>Community Reports]---.3\n .2--Entity-Entity<br/>Relationships--> ce[Candidate<br/>Entities]--Ranking + <br/>Filtering -->pe[Prioritized<br/>Entities]---.3\n .2--Entity-Entity<br/>Relationships--> cr[Candidate<br/>Relationships]--Ranking + <br/>Filtering -->pr[Prioritized<br/>Relationships]---.3\n .2--Entity-Covariate<br/>Mappings--> cc[Candidate<br/>Covariates]--Ranking + <br/>Filtering -->pc[Prioritized<br/>Covariates]---.3\n ch1 -->ch2[Conversation History]---.3\n .3-->res[Response]\n\n classDef green fill:#26B653,stroke:#333,stroke-width:2px,color:#fff;\n classDef turquoise fill:#19CCD3,stroke:#333,stroke-width:2px,color:#fff;\n classDef rose fill:#DD8694,stroke:#333,stroke-width:2px,color:#fff;\n classDef orange fill:#F19914,stroke:#333,stroke-width:2px,color:#fff;\n classDef purple fill:#B356CD,stroke:#333,stroke-width:2px,color:#fff;\n classDef invisible fill:#fff,stroke:#fff,stroke-width:0px,color:#fff, width:0px;\n class uq,ch1 turquoise\n class ee green\n class ctu,ccr,ce,cr,cc rose\n class ptu,pcr,pe,pr,pc,ch2 orange\n class res purple\n class .1,.2,.3 invisible\n\n
Given a user query and, optionally, the conversation history, the local search method identifies a set of entities from the knowledge graph that are semantically-related to the user input. These entities serve as access points into the knowledge graph, enabling the extraction of further relevant details such as connected entities, relationships, entity covariates, and community reports. Additionally, it also extracts relevant text chunks from the raw input documents that are associated with the identified entities. These candidate data sources are then prioritized and filtered to fit within a single context window of pre-defined size, which is used to generate a response to the user query.
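The prioritize-and-filter step can be illustrated with a small self-contained sketch. This is not the GraphRAG implementation: candidate records are assumed to arrive already scored for relevance, and token counting is approximated by whitespace word counts.

```python
def build_local_context(
    candidates: dict[str, list[tuple[str, float]]],
    max_tokens: int,
    count_tokens=lambda text: len(text.split()),  # crude stand-in for a real tokenizer
) -> str:
    """Fill a single context window from ranked candidates of each record type."""
    # Flatten to (record type, text, score) and sort by score, highest first.
    ranked = sorted(
        ((kind, text, score) for kind, items in candidates.items() for text, score in items),
        key=lambda item: item[2],
        reverse=True,
    )
    context, used = [], 0
    for kind, text, _score in ranked:
        cost = count_tokens(text)
        if used + cost > max_tokens:  # budget exhausted: lower-ranked records are dropped
            continue
        context.append(f"[{kind}] {text}")
        used += cost
    return "\n".join(context)


# Toy candidates for a query about chamomile; texts and scores are fabricated.
window = build_local_context(
    {
        "entity": [("Chamomile: a flowering herb commonly brewed as a tea", 0.92)],
        "report": [("Community report on medicinal herbs and their uses", 0.85)],
        "text_unit": [("Chamomile tea is traditionally used to ease sleeplessness", 0.80)],
    },
    max_tokens=30,
)
print(window)
```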
"}, {"location": "query/local_search/#configuration", "title": "Configuration", "text": "Below are the key parameters of the LocalSearch class:
"}, {"location": "query/local_search/#how-to-use", "title": "How to Use", "text": "
- llm: OpenAI model object to be used for response generation
- context_builder: context builder object to be used for preparing context data from collections of knowledge model objects
- system_prompt: prompt template used to generate the search response. Default template can be found at system_prompt
- response_type: free-form text describing the desired response type and format (e.g., Multiple Paragraphs, Multi-Page Report)
- llm_params: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call
- context_builder_params: a dictionary of additional parameters to be passed to the context_builder object when building context for the search prompt
- callbacks: optional callback functions that can be used to provide custom event handlers for the LLM's completion streaming events
An example of a local search scenario can be found in the following notebook.
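A minimal construction sketch, mirroring the parameter list above. The import path, the pre-built llm and context_builder objects, and the context_builder_params keys are assumptions drawn from the local search notebook; check them against your installed GraphRAG version.

```python
# Assumed import path; verify against your installed GraphRAG version.
from graphrag.query.structured_search.local_search.search import LocalSearch

search_engine = LocalSearch(
    llm=llm,                          # OpenAI model object (assumed to exist)
    context_builder=context_builder,  # mixed-context builder over entities, reports, and text units (assumed)
    llm_params={"max_tokens": 2_000, "temperature": 0.0},
    context_builder_params={"max_tokens": 12_000},  # illustrative subset of keys
    response_type="Multiple Paragraphs",
)

# In a notebook or other async context:
result = await search_engine.asearch("What are the healing properties of chamomile?")
print(result.response)
```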
"}, {"location": "query/overview/", "title": "Query Engine \ud83d\udd0e", "text": "The Query Engine is the retrieval module of the Graph RAG Library. It is one of the two main components of the Graph RAG library, the other being the Indexing Pipeline (see Indexing Pipeline). It is responsible for the following tasks:
"}, {"location": "query/overview/#local-search", "title": "Local Search", "text": "
- Local Search
- Global Search
- DRIFT Search
- Question Generation
The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g., What are the healing properties of chamomile?).
For more details about how Local Search works, please refer to the Local Search documentation.
"}, {"location": "query/overview/#global-search", "title": "Global Search", "text": "Global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but often gives good responses for questions that require an understanding of the dataset as a whole (e.g. What are the most significant values of the herbs mentioned in this notebook?).
More about this can be checked at the Global Search documentation.
"}, {"location": "query/overview/#drift-search", "title": "DRIFT Search", "text": "DRIFT Search introduces a new approach to local search queries by including community information in the search process. This greatly expands the breadth of the query\u2019s starting point and leads to retrieval and usage of a far higher variety of facts in the final answer. This addition expands the GraphRAG query engine by providing a more comprehensive option for local search, which uses community insights to refine a query into detailed follow-up questions.
To learn more about DRIFT Search, please refer to the DRIFT Search documentation.
"}, {"location": "query/overview/#question-generation", "title": "Question Generation", "text": "This functionality takes a list of user queries and generates the next candidate questions. This is useful for generating follow-up questions in a conversation or for generating a list of questions for the investigator to dive deeper into the dataset.
Information about how question generation works can be found at the Question Generation documentation page.
"}, {"location": "query/question_generation/", "title": "Question Generation \u2754", "text": ""}, {"location": "query/question_generation/#entity-based-question-generation", "title": "Entity-based Question Generation", "text": "The question generation method combines structured data from the knowledge graph with unstructured data from the input documents to generate candidate questions related to specific entities.
"}, {"location": "query/question_generation/#methodology", "title": "Methodology", "text": "Given a list of prior user questions, the question generation method uses the same context-building approach employed in local search to extract and prioritize relevant structured and unstructured data, including entities, relationships, covariates, community reports and raw text chunks. These data records are then fitted into a single LLM prompt to generate candidate follow-up questions that represent the most important or urgent information content or themes in the data.
"}, {"location": "query/question_generation/#configuration", "title": "Configuration", "text": "Below are the key parameters of the Question Generation class:
"}, {"location": "query/question_generation/#how-to-use", "title": "How to Use", "text": "
- llm: OpenAI model object to be used for response generation
- context_builder: context builder object to be used for preparing context data from collections of knowledge model objects, using the same context builder class as in local search
- system_prompt: prompt template used to generate candidate questions. Default template can be found at system_prompt
- llm_params: a dictionary of additional parameters (e.g., temperature, max_tokens) to be passed to the LLM call
- context_builder_params: a dictionary of additional parameters to be passed to the context_builder object when building context for the question generation prompt
- callbacks: optional callback functions that can be used to provide custom event handlers for the LLM's completion streaming events
An example of the question generation function can be found in the following notebook.
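Before opening the notebook, the following sketch shows how these parameters fit together. The LocalQuestionGen import path and the agenerate signature are assumptions based on the question generation notebook, and the llm and context_builder objects are assumed to be the same ones used for local search.

```python
# Assumed import path and class name; verify against your installed GraphRAG version.
from graphrag.query.question_gen.local_gen import LocalQuestionGen

question_generator = LocalQuestionGen(
    llm=llm,                          # OpenAI model object (assumed to exist)
    context_builder=context_builder,  # same context builder class as in local search (assumed)
    llm_params={"max_tokens": 2_000, "temperature": 0.0},
    context_builder_params={"max_tokens": 12_000},  # illustrative subset of keys
)

# Prior user questions steer which entities the candidate questions focus on.
question_history = [
    "What are the healing properties of chamomile?",
    "Which other herbs are commonly combined with it?",
]

# In a notebook or other async context:
candidates = await question_generator.agenerate(
    question_history=question_history,
    context_data=None,  # let the context builder assemble data from the index
    question_count=5,
)
print(candidates.response)
```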
"}, {"location": "query/notebooks/overview/", "title": "API Notebooks", "text": ""}, {"location": "query/notebooks/overview/#query-engine-notebooks", "title": "Query Engine Notebooks", "text": "
- API Overview Notebook
For examples of running queries, please refer to the following notebooks:
- Global Search Notebook
- Local Search Notebook
- DRIFT Search Notebook
The test dataset for these notebooks can be found in dataset.zip.
"}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 969953a28c..274803bc55 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ