feat(ingest/mongodb) re-order aggregation logic (#12428)
Co-authored-by: Kade Ryu <[email protected]>
Haebuk and KadeRyu authored Feb 5, 2025
1 parent 8773ff5 commit 06bee0d
Showing 3 changed files with 4,617 additions and 16 deletions.
33 changes: 17 additions & 16 deletions metadata-ingestion/src/datahub/ingestion/source/mongodb.py
@@ -219,26 +219,27 @@ def construct_schema_pymongo(
     """
 
     aggregations: List[Dict] = []
+
+    # The order of the aggregations impacts execution time. By setting the sample/limit aggregation first,
+    # the subsequent aggregations process a much smaller dataset, improving performance.
+    if sample_size:
+        if use_random_sampling:
+            aggregations.append({"$sample": {"size": sample_size}})
+        else:
+            aggregations.append({"$limit": sample_size})
+
     if should_add_document_size_filter:
         doc_size_field = "temporary_doc_size_field"
         # create a temporary field to store the size of the document. filter on it and then remove it.
-        aggregations = [
-            {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
-            {"$match": {doc_size_field: {"$lt": max_document_size}}},
-            {"$project": {doc_size_field: 0}},
-        ]
-    if use_random_sampling:
-        # get sample documents in collection
-        if sample_size:
-            aggregations.append({"$sample": {"size": sample_size}})
-        documents = collection.aggregate(
-            aggregations,
-            allowDiskUse=True,
+        aggregations.extend(
+            [
+                {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
+                {"$match": {doc_size_field: {"$lt": max_document_size}}},
+                {"$project": {doc_size_field: 0}},
+            ]
         )
-    else:
-        if sample_size:
-            aggregations.append({"$limit": sample_size})
-        documents = collection.aggregate(aggregations, allowDiskUse=True)
+
+    documents = collection.aggregate(aggregations, allowDiskUse=True)
 
     return construct_schema(list(documents), delimiter)
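
For readers skimming the change, below is a minimal, self-contained pymongo sketch of the reordered pipeline construction. It illustrates the idea in this commit rather than reproducing the DataHub source: the function name build_schema_sampling_pipeline, the localhost connection string, the example_db/example_collection names, and the example parameter values are all assumed for demonstration.

from typing import Any, Dict, List, Optional

import pymongo


def build_schema_sampling_pipeline(
    sample_size: Optional[int],
    use_random_sampling: bool,
    should_add_document_size_filter: bool,
    max_document_size: int,
) -> List[Dict[str, Any]]:
    """Assemble the aggregation stages with the sample/limit stage first."""
    pipeline: List[Dict[str, Any]] = []

    # Sampling (or limiting) first means every later stage only sees the
    # sampled subset instead of the whole collection.
    if sample_size:
        if use_random_sampling:
            pipeline.append({"$sample": {"size": sample_size}})
        else:
            pipeline.append({"$limit": sample_size})

    # Optionally drop oversized documents: compute the BSON size into a
    # temporary field, filter on it, then project the field away.
    if should_add_document_size_filter:
        doc_size_field = "temporary_doc_size_field"
        pipeline.extend(
            [
                {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}},
                {"$match": {doc_size_field: {"$lt": max_document_size}}},
                {"$project": {doc_size_field: 0}},
            ]
        )

    return pipeline


if __name__ == "__main__":
    # Assumed local MongoDB instance and example names, for illustration only.
    client = pymongo.MongoClient("mongodb://localhost:27017")
    collection = client["example_db"]["example_collection"]

    pipeline = build_schema_sampling_pipeline(
        sample_size=1000,
        use_random_sampling=True,
        should_add_document_size_filter=True,
        max_document_size=16 * 1024 * 1024,
    )
    documents = collection.aggregate(pipeline, allowDiskUse=True)
    print(f"sampled {len(list(documents))} documents")

With the previous ordering, the $addFields/$match size filter ran over every document in the collection before any sampling; putting $sample or $limit first keeps the size computation to at most sample_size documents.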
