Commit: Replace md5 hash (#1470)
* switched hashing function helper to sha256

* refactored references to hashing util

* semversioner

* switched from sha256 to sha512

* new semversioner

* updated tests/verbs/data folder

* generated fresh parquet files in data folder

* moved ignore flag
KennyZhang1 authored Dec 5, 2024
1 parent d17dfd0 commit 10f84c9
Showing 21 changed files with 21 additions and 12 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241204203534799756.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "replaced md5 hash with sha256"
+}
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241204211013990211.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "replaced md5 hash with sha512"
+}
5 changes: 3 additions & 2 deletions graphrag/index/flows/create_base_entity_graph.py
@@ -169,5 +169,6 @@ def _prep_communities(communities) -> pd.DataFrame:
 
 def _compute_degree(graph: nx.Graph) -> pd.DataFrame:
     return pd.DataFrame([
-        {"name": node, "degree": int(degree)} for node, degree in graph.degree
-    ])  # type: ignore
+        {"name": node, "degree": int(degree)}
+        for node, degree in graph.degree  # type: ignore
+    ])
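For reference, a minimal, self-contained sketch of the degree-table pattern this hunk touches; the toy graph below is invented for illustration and is not part of the commit:

import networkx as nx
import pandas as pd

# Toy graph purely for illustration.
graph = nx.Graph()
graph.add_edges_from([("a", "b"), ("b", "c"), ("c", "a"), ("c", "d")])

# graph.degree iterates as (node, degree) pairs, so the comprehension
# builds one row per node, matching _compute_degree above.
degree_df = pd.DataFrame([
    {"name": node, "degree": int(degree)}
    for node, degree in graph.degree
])
print(degree_df)  # a: 2, b: 2, c: 3, d: 1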
4 changes: 2 additions & 2 deletions graphrag/index/flows/create_base_text_units.py
@@ -16,7 +16,7 @@
 
 from graphrag.index.operations.chunk_text import chunk_text
 from graphrag.index.operations.snapshot import snapshot
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.storage.pipeline_storage import PipelineStorage


@@ -67,7 +67,7 @@ async def create_base_text_units(
         },
         inplace=True,
     )
-    chunked["id"] = chunked.apply(lambda row: gen_md5_hash(row, ["chunk"]), axis=1)
+    chunked["id"] = chunked.apply(lambda row: gen_sha512_hash(row, ["chunk"]), axis=1)
     chunked[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
         chunked["chunk"].tolist(), index=chunked.index
     )
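A hedged illustration of the change above, with invented sample chunks; it assumes the updated graphrag package is importable:

import pandas as pd

from graphrag.index.utils.hashing import gen_sha512_hash

# Invented sample chunks for illustration only.
chunked = pd.DataFrame({"chunk": ["first chunk of text", "second chunk of text"]})

# Each row id is derived solely from the "chunk" column, so identical chunk
# text always produces the same 128-character hex id across runs.
chunked["id"] = chunked.apply(lambda row: gen_sha512_hash(row, ["chunk"]), axis=1)
print(chunked["id"].str.len().unique())  # [128]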
4 changes: 2 additions & 2 deletions graphrag/index/input/csv.py
@@ -11,7 +11,7 @@
 import pandas as pd
 
 from graphrag.index.config.input import PipelineCSVInputConfig, PipelineInputConfig
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.logging.base import ProgressReporter
 from graphrag.storage.pipeline_storage import PipelineStorage

@@ -42,7 +42,7 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
             lambda _row: pd.Series([group[key] for key in additional_keys]), axis=1
         )
     if "id" not in data.columns:
-        data["id"] = data.apply(lambda x: gen_md5_hash(x, x.keys()), axis=1)
+        data["id"] = data.apply(lambda x: gen_sha512_hash(x, x.keys()), axis=1)
     if csv_config.source_column is not None and "source" not in data.columns:
         if csv_config.source_column not in data.columns:
             log.warning(
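A small sketch of the conditional id assignment shown above; the DataFrame is made up, and the import assumes the updated hashing module from this commit:

import pandas as pd

from graphrag.index.utils.hashing import gen_sha512_hash

# Invented CSV rows with no "id" column.
data = pd.DataFrame({"title": ["doc-a", "doc-b"], "text": ["hello", "world"]})

if "id" not in data.columns:
    # Hash every column of the row, so changing any field yields a new id.
    data["id"] = data.apply(lambda x: gen_sha512_hash(x, x.keys()), axis=1)

print(data[["title", "id"]])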
4 changes: 2 additions & 2 deletions graphrag/index/input/text.py
@@ -11,7 +11,7 @@
 import pandas as pd
 
 from graphrag.index.config.input import PipelineInputConfig
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.logging.base import ProgressReporter
 from graphrag.storage.pipeline_storage import PipelineStorage

@@ -36,7 +36,7 @@ async def load_file(
         group = {}
     text = await storage.get(path, encoding="utf-8")
     new_item = {**group, "text": text}
-    new_item["id"] = gen_md5_hash(new_item, new_item.keys())
+    new_item["id"] = gen_sha512_hash(new_item, new_item.keys())
     new_item["title"] = str(Path(path).name)
     return new_item

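A brief sketch of the same id generation for text inputs; the point (hedged, since the commit does not state motivation) is that the id is deterministic, so re-ingesting identical text reproduces the same id:

from graphrag.index.utils.hashing import gen_sha512_hash

# Invented document content for illustration.
new_item = {"source": "example-group", "text": "The same document text."}

first = gen_sha512_hash(new_item, new_item.keys())
second = gen_sha512_hash(new_item, new_item.keys())
assert first == second  # same content -> same id on every run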
8 changes: 4 additions & 4 deletions graphrag/index/utils/hashing.py
@@ -4,11 +4,11 @@
 """Hashing utilities."""
 
 from collections.abc import Iterable
-from hashlib import md5
+from hashlib import sha512
 from typing import Any
 
 
-def gen_md5_hash(item: dict[str, Any], hashcode: Iterable[str]):
-    """Generate an md5 hash."""
+def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]):
+    """Generate a SHA512 hash."""
     hashed = "".join([str(item[column]) for column in hashcode])
-    return f"{md5(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
+    return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
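A quick usage sketch of the updated helper; the row dict is illustrative, and the import path matches the module changed above:

from graphrag.index.utils.hashing import gen_sha512_hash

row = {"title": "example", "text": "some document text"}
digest = gen_sha512_hash(row, ["title", "text"])

# SHA-512 hex digests are 128 characters, versus 32 for the old MD5 ids,
# so downstream ids become longer but remain plain hex strings.
print(len(digest))  # 128

The usedforsecurity=False flag is carried over from the MD5 version; it signals (for example on FIPS-enabled Python builds) that the digest is used as an identifier rather than for security.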
Binary file modified tests/verbs/data/base_communities.parquet
Binary file modified tests/verbs/data/base_entity_nodes.parquet
Binary file modified tests/verbs/data/base_relationship_edges.parquet
Binary file removed tests/verbs/data/create_base_entity_graph.parquet
Binary file modified tests/verbs/data/create_base_text_units.parquet
Binary file modified tests/verbs/data/create_final_communities.parquet
Binary file modified tests/verbs/data/create_final_community_reports.parquet
Binary file modified tests/verbs/data/create_final_covariates.parquet
Binary file modified tests/verbs/data/create_final_documents.parquet
Binary file modified tests/verbs/data/create_final_entities.parquet
Binary file modified tests/verbs/data/create_final_nodes.parquet
Binary file modified tests/verbs/data/create_final_relationships.parquet
Binary file modified tests/verbs/data/create_final_text_units.parquet
Binary file modified tests/verbs/data/source_documents.parquet
