Collapse create base documents (#1176)

* Collapse non-attribute verbs
* Include document_attribute_columns in collapse
* Remove merge_override verb
* Semver
* Clean up some df/tests
microsoft · Sep 23, 2024 · fbc483e · fbc483e
1 parent ea46820
commit fbc483e
Show file tree

Hide file tree

Showing 11 changed files with 178 additions and 169 deletions.
diff --git a/.semversioner/next-release/patch-20240920192804408249.json b/.semversioner/next-release/patch-20240920192804408249.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Collapse create_base_documents."
+}
diff --git a/graphrag/index/verbs/__init__.py b/graphrag/index/verbs/__init__.py
@@ -15,7 +15,7 @@
     merge_graphs,
     unpack_graph,
 )
-from .overrides import aggregate, concat, merge
+from .overrides import aggregate, concat
 from .snapshot import snapshot
 from .snapshot_rows import snapshot_rows
 from .spread_json import spread_json
@@ -35,7 +35,6 @@
     "extract_covariates",
     "genid",
     "layout_graph",
-    "merge",
     "merge_graphs",
     "snapshot",
     "snapshot_rows",

diff --git a/graphrag/index/verbs/overrides/__init__.py b/graphrag/index/verbs/overrides/__init__.py
@@ -5,6 +5,5 @@
 
 from .aggregate import aggregate
 from .concat import concat
-from .merge import merge
 
-__all__ = ["aggregate", "concat", "merge"]
+__all__ = ["aggregate", "concat"]
diff --git a/graphrag/index/verbs/overrides/merge.py b/graphrag/index/verbs/overrides/merge.py
diff --git a/graphrag/index/workflows/v1/create_base_documents.py b/graphrag/index/workflows/v1/create_base_documents.py
@@ -22,84 +22,13 @@ def build_steps(
     document_attribute_columns = config.get("document_attribute_columns", [])
     return [
         {
-            "verb": "unroll",
-            "args": {"column": "document_ids"},
-            "input": {"source": "workflow:create_final_text_units"},
-        },
-        {
-            "verb": "select",
-            "args": {
-                # We only need the chunk id and the document id
-                "columns": ["id", "document_ids", "text"]
-            },
-        },
-        {
-            "id": "rename_chunk_doc_id",
-            "verb": "rename",
-            "args": {
-                "columns": {
-                    "document_ids": "chunk_doc_id",
-                    "id": "chunk_id",
-                    "text": "chunk_text",
-                }
-            },
-        },
-        {
-            "verb": "join",
-            "args": {
-                # Join the doc id from the chunk onto the original document
-                "on": ["chunk_doc_id", "id"]
-            },
-            "input": {"source": "rename_chunk_doc_id", "others": [DEFAULT_INPUT_NAME]},
-        },
-        {
-            "id": "docs_with_text_units",
-            "verb": "aggregate_override",
-            "args": {
-                "groupby": ["id"],
-                "aggregations": [
-                    {
-                        "column": "chunk_id",
-                        "operation": "array_agg",
-                        "to": "text_units",
-                    }
-                ],
-            },
-        },
-        {
-            "verb": "join",
+            "verb": "create_base_documents",
             "args": {
-                "on": ["id", "id"],
-                "strategy": "right outer",
+                "document_attribute_columns": document_attribute_columns,
             },
             "input": {
-                "source": "docs_with_text_units",
-                "others": [DEFAULT_INPUT_NAME],
-            },
-        },
-        {
-            "verb": "rename",
-            "args": {"columns": {"text": "raw_content"}},
-        },
-        *[
-            {
-                "verb": "convert",
-                "args": {
-                    "column": column,
-                    "to": column,
-                    "type": "string",
-                },
-            }
-            for column in document_attribute_columns
-        ],
-        {
-            "verb": "merge_override",
-            "enabled": len(document_attribute_columns) > 0,
-            "args": {
-                "columns": document_attribute_columns,
-                "strategy": "json",
-                "to": "attributes",
+                "source": DEFAULT_INPUT_NAME,
+                "others": ["workflow:create_final_text_units"],
             },
         },
-        {"verb": "convert", "args": {"column": "id", "to": "id", "type": "string"}},
     ]
diff --git a/graphrag/index/workflows/v1/subflows/__init__.py b/graphrag/index/workflows/v1/subflows/__init__.py
@@ -3,6 +3,7 @@
 
 """The Indexing Engine workflows -> subflows package root."""
 
+from .create_base_documents import create_base_documents
 from .create_final_communities import create_final_communities
 from .create_final_nodes import create_final_nodes
 from .create_final_relationships_post_embedding import (
@@ -14,6 +15,7 @@
 from .create_final_text_units_pre_embedding import create_final_text_units_pre_embedding
 
 __all__ = [
+    "create_base_documents",
     "create_final_communities",
     "create_final_nodes",
     "create_final_relationships_post_embedding",

diff --git a/graphrag/index/workflows/v1/subflows/create_base_documents.py b/graphrag/index/workflows/v1/subflows/create_base_documents.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""All the steps to transform base documents."""
+
+from typing import cast
+
+import pandas as pd
+from datashaper import (
+    Table,
+    VerbInput,
+    verb,
+)
+from datashaper.table_store.types import VerbResult, create_verb_result
+
+from graphrag.index.verbs.overrides.aggregate import aggregate_df
+
+
+@verb(name="create_base_documents", treats_input_tables_as_immutable=True)
+def create_base_documents(
+    input: VerbInput,
+    document_attribute_columns: list[str] | None = None,
+    **_kwargs: dict,
+) -> VerbResult:
+    """Build the base documents table from source documents and text units.
+
+    The primary input is the source documents table; the first "other" input
+    is the text units table, which must provide ``id``, ``document_ids`` and
+    ``text`` columns. Each document is annotated with the ids of the text
+    units that reference it.
+
+    Args:
+        input: Verb input carrying the documents table and, as its first
+            "other" table, the text units table.
+        document_attribute_columns: Optional document columns to stringify
+            and collapse into a single ``attributes`` dict column. The
+            listed columns are removed from the output.
+        **_kwargs: Ignored; accepted so extra verb arguments do not fail.
+
+    Returns:
+        A verb result wrapping one table that keeps every source document
+        (right join), with ``text`` renamed to ``raw_content``, ``id`` cast
+        to ``str``, a ``text_units`` column of aggregated text unit ids, and
+        ``attributes`` when attribute columns were requested.
+    """
+    source = cast(pd.DataFrame, input.get_input())
+    text_units = cast(pd.DataFrame, input.get_others()[0])
+
+    # One row per (text unit, document) pair. rename() returns a new frame, so
+    # the column slice of the exploded table is never mutated in place.
+    chunks = cast(
+        pd.DataFrame,
+        text_units.explode("document_ids")[["id", "document_ids", "text"]].rename(
+            columns={
+                "document_ids": "chunk_doc_id",
+                "id": "chunk_id",
+                "text": "chunk_text",
+            },
+        ),
+    )
+
+    # Attach each chunk to the document it came from.
+    joined = chunks.merge(
+        source,
+        left_on="chunk_doc_id",
+        right_on="id",
+        how="inner",
+    )
+
+    # Collect the chunk ids of every document into a single list column.
+    docs_with_text_units = aggregate_df(
+        joined,
+        groupby=["id"],
+        aggregations=[
+            {
+                "column": "chunk_id",
+                "operation": "array_agg",
+                "to": "text_units",
+            }
+        ],
+    )
+
+    # The right join keeps documents that have no text units at all.
+    rejoined = docs_with_text_units.merge(
+        source,
+        on="id",
+        how="right",
+    ).rename(columns={"text": "raw_content"})
+    rejoined["id"] = rejoined["id"].astype(str)
+
+    if document_attribute_columns:
+        # Attribute values are stringified, then each row's values are
+        # collapsed into a single dict keyed by column name.
+        attributes = rejoined[document_attribute_columns].astype(str)
+        rejoined = rejoined.drop(columns=document_attribute_columns)
+        rejoined["attributes"] = attributes.to_dict(orient="records")
+        # reset_index returns a new frame, so the result must be rebound.
+        rejoined = rejoined.reset_index(drop=True)
+
+    return create_verb_result(
+        cast(
+            Table,
+            rejoined,
+        )
+    )
diff --git a/tests/fixtures/min-csv/config.json b/tests/fixtures/min-csv/config.json
@@ -113,7 +113,7 @@
                 1,
                 2000
             ],
-            "subworkflows": 8,
+            "subworkflows": 1,
             "max_runtime": 10
         },
         "create_final_documents": {

diff --git a/tests/fixtures/text/config.json b/tests/fixtures/text/config.json
@@ -132,7 +132,7 @@
                 1,
                 2000
             ],
-            "subworkflows": 8,
+            "subworkflows": 1,
             "max_runtime": 10
         },
         "create_final_documents": {

diff --git a/tests/verbs/test_create_base_documents.py b/tests/verbs/test_create_base_documents.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from graphrag.index.workflows.v1.create_base_documents import (
+    build_steps,
+    workflow_name,
+)
+
+from .util import (
+    compare_outputs,
+    get_config_for_workflow,
+    get_workflow_output,
+    load_expected,
+    load_input_tables,
+)
+
+
+async def test_create_base_documents():
+    """Run the workflow with its default config and compare to the fixture."""
+    tables = load_input_tables(["workflow:create_final_text_units"])
+    expected_output = load_expected(workflow_name)
+
+    # Build the workflow definition straight from the stock configuration.
+    workflow = {"steps": build_steps(get_config_for_workflow(workflow_name))}
+
+    actual_output = await get_workflow_output(tables, workflow)
+
+    compare_outputs(actual_output, expected_output)
+
+
+async def test_create_base_documents_with_attribute_columns():
+    """Run the workflow with "title" collapsed into the attributes column."""
+    tables = load_input_tables(["workflow:create_final_text_units"])
+    expected_output = load_expected(workflow_name)
+
+    workflow_config = get_config_for_workflow(workflow_name)
+    workflow_config["document_attribute_columns"] = ["title"]
+
+    workflow = {"steps": build_steps(workflow_config)}
+    actual_output = await get_workflow_output(tables, workflow)
+
+    # "title" is replaced by "attributes" in the output. The expected fixture
+    # has no "attributes" column, so only the shared columns are compared and
+    # the presence of the new column is asserted separately.
+    shared_columns = ["id", "text_units", "raw_content"]
+    compare_outputs(actual_output, expected_output, columns=shared_columns)
+    assert len(actual_output.columns) == 4
+    assert "attributes" in actual_output.columns