Skip to content

Commit

Permalink
Mark snapshot-id as deprecated in SetStatisticsUpdate (#1566)
Browse files Browse the repository at this point in the history
Closes #1556

---------

Co-authored-by: Fokko Driesprong <[email protected]>
  • Loading branch information
ndrluis and Fokko authored Jan 24, 2025
1 parent a93e300 commit 3b53edc
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 25 deletions.
23 changes: 16 additions & 7 deletions pyiceberg/table/update/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
from abc import ABC, abstractmethod
from datetime import datetime
from functools import singledispatch
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union, cast

from pydantic import Field, field_validator
from pydantic import Field, field_validator, model_validator
from typing_extensions import Annotated

from pyiceberg.exceptions import CommitFailedException
Expand Down Expand Up @@ -177,8 +177,20 @@ class RemovePropertiesUpdate(IcebergBaseModel):

class SetStatisticsUpdate(IcebergBaseModel):
action: Literal["set-statistics"] = Field(default="set-statistics")
snapshot_id: int = Field(alias="snapshot-id")
statistics: StatisticsFile
snapshot_id: Optional[int] = Field(
None,
alias="snapshot-id",
description="snapshot-id is **DEPRECATED for REMOVAL** since it contains redundant information. Use `statistics.snapshot-id` field instead.",
)

@model_validator(mode="before")
def validate_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]:
stats = cast(StatisticsFile, data["statistics"])

data["snapshot_id"] = stats.snapshot_id

return data


class RemoveStatisticsUpdate(IcebergBaseModel):
Expand Down Expand Up @@ -491,10 +503,7 @@ def _(

@_apply_table_update.register(SetStatisticsUpdate)
def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
if update.snapshot_id != update.statistics.snapshot_id:
raise ValueError("Snapshot id in statistics does not match the snapshot id in the update")

statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id)
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.statistics.snapshot_id)
context.add_update(update)

return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]})
Expand Down
3 changes: 1 addition & 2 deletions pyiceberg/table/update/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@ class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
def __init__(self, transaction: "Transaction") -> None:
super().__init__(transaction)

def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics":
def set_statistics(self, statistics_file: StatisticsFile) -> "UpdateStatistics":
self._updates += (
SetStatisticsUpdate(
snapshot_id=snapshot_id,
statistics=statistics_file,
),
)
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_statistics_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ def create_statistics_file(snapshot_id: int, type_name: str) -> StatisticsFile:
statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2, "deletion-vector-v1")

with tbl.update_statistics() as update:
update.set_statistics(add_snapshot_id_1, statistics_file_snap_1)
update.set_statistics(add_snapshot_id_2, statistics_file_snap_2)
update.set_statistics(statistics_file_snap_1)
update.set_statistics(statistics_file_snap_2)

assert len(tbl.metadata.statistics) == 2

Expand Down
14 changes: 0 additions & 14 deletions tests/table/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,20 +1310,6 @@ def test_set_statistics_update(table_v2_with_statistics: Table) -> None:
assert len(updated_statistics) == 1
assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected)

update = SetStatisticsUpdate(
snapshot_id=123456789,
statistics=statistics_file,
)

with pytest.raises(
ValueError,
match="Snapshot id in statistics does not match the snapshot id in the update",
):
update_table_metadata(
table_v2_with_statistics.metadata,
(update,),
)


def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
update = RemoveStatisticsUpdate(
Expand Down

0 comments on commit 3b53edc

Please sign in to comment.