Skip to content

Commit

Permalink
Collapse create base documents (#1176)
Browse files Browse the repository at this point in the history
* Collapse non-attribute verbs

* Include document_column_attributes in collapse

* Remove merge_override verb

* Semver

* Clean up some df/tests
  • Loading branch information
natoverse authored Sep 23, 2024
1 parent ea46820 commit fbc483e
Show file tree
Hide file tree
Showing 11 changed files with 178 additions and 169 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240920192804408249.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Collapse create_base_documents."
}
3 changes: 1 addition & 2 deletions graphrag/index/verbs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
merge_graphs,
unpack_graph,
)
from .overrides import aggregate, concat, merge
from .overrides import aggregate, concat
from .snapshot import snapshot
from .snapshot_rows import snapshot_rows
from .spread_json import spread_json
Expand All @@ -35,7 +35,6 @@
"extract_covariates",
"genid",
"layout_graph",
"merge",
"merge_graphs",
"snapshot",
"snapshot_rows",
Expand Down
3 changes: 1 addition & 2 deletions graphrag/index/verbs/overrides/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@

from .aggregate import aggregate
from .concat import concat
from .merge import merge

__all__ = ["aggregate", "concat", "merge"]
__all__ = ["aggregate", "concat"]
78 changes: 0 additions & 78 deletions graphrag/index/verbs/overrides/merge.py

This file was deleted.

79 changes: 4 additions & 75 deletions graphrag/index/workflows/v1/create_base_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,84 +22,13 @@ def build_steps(
document_attribute_columns = config.get("document_attribute_columns", [])
return [
{
"verb": "unroll",
"args": {"column": "document_ids"},
"input": {"source": "workflow:create_final_text_units"},
},
{
"verb": "select",
"args": {
# We only need the chunk id and the document id
"columns": ["id", "document_ids", "text"]
},
},
{
"id": "rename_chunk_doc_id",
"verb": "rename",
"args": {
"columns": {
"document_ids": "chunk_doc_id",
"id": "chunk_id",
"text": "chunk_text",
}
},
},
{
"verb": "join",
"args": {
# Join the doc id from the chunk onto the original document
"on": ["chunk_doc_id", "id"]
},
"input": {"source": "rename_chunk_doc_id", "others": [DEFAULT_INPUT_NAME]},
},
{
"id": "docs_with_text_units",
"verb": "aggregate_override",
"args": {
"groupby": ["id"],
"aggregations": [
{
"column": "chunk_id",
"operation": "array_agg",
"to": "text_units",
}
],
},
},
{
"verb": "join",
"verb": "create_base_documents",
"args": {
"on": ["id", "id"],
"strategy": "right outer",
"document_attribute_columns": document_attribute_columns,
},
"input": {
"source": "docs_with_text_units",
"others": [DEFAULT_INPUT_NAME],
},
},
{
"verb": "rename",
"args": {"columns": {"text": "raw_content"}},
},
*[
{
"verb": "convert",
"args": {
"column": column,
"to": column,
"type": "string",
},
}
for column in document_attribute_columns
],
{
"verb": "merge_override",
"enabled": len(document_attribute_columns) > 0,
"args": {
"columns": document_attribute_columns,
"strategy": "json",
"to": "attributes",
"source": DEFAULT_INPUT_NAME,
"others": ["workflow:create_final_text_units"],
},
},
{"verb": "convert", "args": {"column": "id", "to": "id", "type": "string"}},
]
2 changes: 2 additions & 0 deletions graphrag/index/workflows/v1/subflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""The Indexing Engine workflows -> subflows package root."""

from .create_base_documents import create_base_documents
from .create_final_communities import create_final_communities
from .create_final_nodes import create_final_nodes
from .create_final_relationships_post_embedding import (
Expand All @@ -14,6 +15,7 @@
from .create_final_text_units_pre_embedding import create_final_text_units_pre_embedding

__all__ = [
"create_base_documents",
"create_final_communities",
"create_final_nodes",
"create_final_relationships_post_embedding",
Expand Down
84 changes: 84 additions & 0 deletions graphrag/index/workflows/v1/subflows/create_base_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""All the steps to transform base documents."""

from typing import cast

import pandas as pd
from datashaper import (
Table,
VerbInput,
verb,
)
from datashaper.table_store.types import VerbResult, create_verb_result

from graphrag.index.verbs.overrides.aggregate import aggregate_df


@verb(name="create_base_documents", treats_input_tables_as_immutable=True)
def create_base_documents(
    input: VerbInput,
    document_attribute_columns: list[str] | None = None,
    **_kwargs: dict,
) -> VerbResult:
    """Build the base documents table by linking documents to their text units.

    The verb performs, in order:

    1. Explode the text units table on ``document_ids`` so there is one row
       per (text unit, document id) pair, keeping only ``id``,
       ``document_ids`` and ``text`` (renamed to ``chunk_id``,
       ``chunk_doc_id`` and ``chunk_text``).
    2. Inner-join those rows onto the source documents via
       ``chunk_doc_id == id``.
    3. Aggregate ``chunk_id`` per document ``id`` into a ``text_units`` column
       using the ``array_agg`` operation of ``aggregate_df``.
    4. Right-join the aggregate back onto the source documents so that every
       source row is retained, rename ``text`` to ``raw_content`` and cast
       ``id`` to ``str``.
    5. Optionally cast each attribute column to ``str`` and collapse them into
       a single ``attributes`` dict column, dropping the original columns.

    Args:
        input: Verb input; ``get_input()`` supplies the source documents table
            and the first entry of ``get_others()`` supplies the text units
            table.
        document_attribute_columns: Names of source columns to fold into the
            ``attributes`` column. ``None`` or an empty list skips this step
            and no ``attributes`` column is added.
        **_kwargs: Ignored; accepted so extra verb arguments do not fail.

    Returns:
        A verb result wrapping the assembled documents table.

    Raises:
        KeyError: If a name in ``document_attribute_columns`` is not a column
            of the joined table (raised by pandas on column lookup).
    """
    source = cast(pd.DataFrame, input.get_input())
    text_units = cast(pd.DataFrame, input.get_others()[0])

    # Rename on the returned frame rather than in place: the column selection
    # yields a derived frame, and mutating it in place can trigger pandas'
    # SettingWithCopyWarning.
    text_units = cast(
        pd.DataFrame,
        text_units.explode("document_ids")[["id", "document_ids", "text"]],
    ).rename(
        columns={
            "document_ids": "chunk_doc_id",
            "id": "chunk_id",
            "text": "chunk_text",
        }
    )

    # Attach each chunk to the document it was cut from.
    joined = text_units.merge(
        source,
        left_on="chunk_doc_id",
        right_on="id",
        how="inner",
    )

    docs_with_text_units = aggregate_df(
        joined,
        groupby=["id"],
        aggregations=[
            {
                "column": "chunk_id",
                "operation": "array_agg",
                "to": "text_units",
            }
        ],
    )

    # Right join keeps every source row, including documents that have no
    # matching text units.
    rejoined = docs_with_text_units.merge(
        source,
        on="id",
        how="right",
    ).rename(columns={"text": "raw_content"})
    rejoined["id"] = rejoined["id"].astype(str)

    # Attribute columns are converted to strings and then collapsed into a
    # single dict per row.
    if document_attribute_columns:
        for column in document_attribute_columns:
            rejoined[column] = rejoined[column].astype(str)
        rejoined["attributes"] = rejoined[document_attribute_columns].apply(
            lambda row: {**row},
            axis=1,
        )
        rejoined = rejoined.drop(columns=document_attribute_columns)

    # reset_index returns a new frame; the original call discarded it, making
    # the statement a no-op. Keep the result and drop the old index so it is
    # not inserted as a spurious column.
    rejoined = rejoined.reset_index(drop=True)

    return create_verb_result(
        cast(
            Table,
            rejoined,
        )
    )
2 changes: 1 addition & 1 deletion tests/fixtures/min-csv/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
1,
2000
],
"subworkflows": 8,
"subworkflows": 1,
"max_runtime": 10
},
"create_final_documents": {
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/text/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
1,
2000
],
"subworkflows": 8,
"subworkflows": 1,
"max_runtime": 10
},
"create_final_documents": {
Expand Down
58 changes: 58 additions & 0 deletions tests/verbs/test_create_base_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

from graphrag.index.workflows.v1.create_base_documents import (
build_steps,
workflow_name,
)

from .util import (
compare_outputs,
get_config_for_workflow,
get_workflow_output,
load_expected,
load_input_tables,
)


async def test_create_base_documents():
    """Run the workflow with its default config and compare to the expected table."""
    tables = load_input_tables(["workflow:create_final_text_units"])
    expected_output = load_expected(workflow_name)

    workflow_config = get_config_for_workflow(workflow_name)
    workflow_definition = {"steps": build_steps(workflow_config)}

    actual_output = await get_workflow_output(tables, workflow_definition)

    compare_outputs(actual_output, expected_output)


async def test_create_base_documents_with_attribute_columns():
    """Run the workflow with ``title`` configured as a document attribute column."""
    tables = load_input_tables(["workflow:create_final_text_units"])
    expected_output = load_expected(workflow_name)

    workflow_config = get_config_for_workflow(workflow_name)
    workflow_config["document_attribute_columns"] = ["title"]

    workflow_definition = {"steps": build_steps(workflow_config)}
    actual_output = await get_workflow_output(tables, workflow_definition)

    # "title" should have been dropped and "attributes" added. The expected
    # fixture has no attributes column, so compare only the shared columns and
    # check for the new column separately.
    shared_columns = ["id", "text_units", "raw_content"]
    compare_outputs(actual_output, expected_output, columns=shared_columns)
    assert len(actual_output.columns) == 4
    assert "attributes" in actual_output.columns
Loading

0 comments on commit fbc483e

Please sign in to comment.