
drop dataset stats from catalog and cli (#878)
mattseddon authored Jan 31, 2025
1 parent 10c2702 commit 2a4693e
Showing 10 changed files with 30 additions and 102 deletions.
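
Taken together, the change removes the DatasetStats dataclass, the Catalog.dataset_stats() method, and the "datachain dataset stats" CLI subcommand; the same numbers remain available on the dataset version record. A minimal migration sketch, assuming an existing Catalog instance bound to `catalog` and a registered dataset named "my-dataset" (the name is hypothetical):

# Before (API removed by this commit):
#     stats = catalog.dataset_stats("my-dataset", version=1)
#     print(stats.num_objects, stats.size)

# After: read the same fields from the DatasetVersion record.
dataset = catalog.get_dataset("my-dataset")
version = dataset.get_version(dataset.latest_version)
print(version.num_objects)  # total number of rows; None if the table is missing
print(version.size)         # total size in bytes; None if missing or empty
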
12 changes: 0 additions & 12 deletions src/datachain/catalog/catalog.py
@@ -38,7 +38,6 @@
     DatasetDependency,
     DatasetListRecord,
     DatasetRecord,
-    DatasetStats,
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
@@ -1235,17 +1234,6 @@ def dataset_table_export_file_names(self, name: str, version: int) -> list[str]:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
-        """
-        Returns tuple with dataset stats: total number of rows and total dataset size.
-        """
-        dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version or dataset.latest_version)
-        return DatasetStats(
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-        )
-
     def remove_dataset(
         self,
         name: str,
8 changes: 0 additions & 8 deletions src/datachain/cli/__init__.py
@@ -11,7 +11,6 @@
 from .commands import (
     clear_cache,
     completion,
-    dataset_stats,
     du,
     edit_dataset,
     garbage_collect,
@@ -182,13 +181,6 @@ def handle_dataset_command(args, catalog):
             all=args.all,
             team=args.team,
         ),
-        "stats": lambda: dataset_stats(
-            catalog,
-            args.name,
-            args.version,
-            show_bytes=args.bytes,
-            si=args.si,
-        ),
     }

     handler = dataset_commands.get(args.datasets_cmd)
2 changes: 0 additions & 2 deletions src/datachain/cli/commands/__init__.py
@@ -1,5 +1,4 @@
 from .datasets import (
-    dataset_stats,
     edit_dataset,
     list_datasets,
     list_datasets_local,
@@ -15,7 +14,6 @@
 __all__ = [
     "clear_cache",
     "completion",
-    "dataset_stats",
     "du",
     "edit_dataset",
     "garbage_collect",
19 changes: 0 additions & 19 deletions src/datachain/cli/commands/datasets.py
@@ -3,8 +3,6 @@

 from tabulate import tabulate

-from datachain import utils
-
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

@@ -109,20 +107,3 @@ def edit_dataset(

     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)
-
-
-def dataset_stats(
-    catalog: "Catalog",
-    name: str,
-    version: int,
-    show_bytes=False,
-    si=False,
-):
-    stats = catalog.dataset_stats(name, version)
-
-    if stats:
-        print(f"Number of objects: {stats.num_objects}")
-        if show_bytes:
-            print(f"Total objects size: {stats.size}")
-        else:
-            print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
25 changes: 0 additions & 25 deletions src/datachain/cli/parser/__init__.py
@@ -307,31 +307,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="The team to delete a dataset. By default, it will use team from config",
     )

-    dataset_stats_parser = datasets_subparser.add_parser(
-        "stats", parents=[parent_parser], description="Show basic dataset statistics."
-    )
-    dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
-    dataset_stats_parser.add_argument(
-        "--version",
-        action="store",
-        default=None,
-        type=int,
-        help="Dataset version",
-    )
-    dataset_stats_parser.add_argument(
-        "-b",
-        "--bytes",
-        default=False,
-        action="store_true",
-        help="Display size in bytes instead of human-readable size",
-    )
-    dataset_stats_parser.add_argument(
-        "--si",
-        default=False,
-        action="store_true",
-        help="Display size using powers of 1000 not 1024",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
6 changes: 0 additions & 6 deletions src/datachain/dataset.py
@@ -150,12 +150,6 @@ def __hash__(self):
         return hash(f"{self.type}_{self.name}_{self.version}")


-@dataclass
-class DatasetStats:
-    num_objects: Optional[int]  # None if table is missing
-    size: Optional[int]  # in bytes None if table is missing or empty
-
-
 class DatasetStatus:
     CREATED = 1
     PENDING = 2
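
Downstream code that still imports DatasetStats from datachain.dataset will now fail; since it was a plain two-field container, one workaround is to vendor it locally. A sketch under that assumption (the shim and the catalog/stats variables are illustrative, not library code):

from dataclasses import dataclass
from typing import Optional


@dataclass
class DatasetStats:
    # local stand-in for the removed class; both fields live on DatasetVersion now
    num_objects: Optional[int]  # None if table is missing
    size: Optional[int]  # in bytes; None if table is missing or empty


dv = catalog.get_dataset("my-dataset").get_version(1)
stats = DatasetStats(num_objects=dv.num_objects, size=dv.size)
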
2 changes: 0 additions & 2 deletions src/datachain/remote/studio.py
@@ -16,14 +16,12 @@
 import websockets

 from datachain.config import Config
-from datachain.dataset import DatasetStats
 from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff

 T = TypeVar("T")
 LsData = Optional[list[dict[str, Any]]]
 DatasetInfoData = Optional[dict[str, Any]]
-DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
 DatasetJobVersionsData = Optional[dict[str, Any]]
 DatasetExportStatus = Optional[dict[str, Any]]
45 changes: 23 additions & 22 deletions tests/func/test_catalog.py
@@ -17,7 +17,8 @@
 def listing_stats(uri, catalog):
     list_dataset_name, _, _ = parse_listing_uri(uri, catalog.client_config)
     dataset = catalog.get_dataset(list_dataset_name)
-    return catalog.dataset_stats(dataset.name, dataset.latest_version)
+    dataset_version = dataset.get_version(dataset.latest_version)
+    return dataset_version.num_objects, dataset_version.size


 @pytest.fixture
@@ -582,23 +583,23 @@ def test_listing_stats(cloud_test_catalog):
         listing_stats(src_uri, catalog)

     catalog.enlist_source(src_uri)
-    stats = listing_stats(src_uri, catalog)
-    assert stats.num_objects == 7
-    assert stats.size == 36
+    num_objects, size = listing_stats(src_uri, catalog)
+    assert num_objects == 7
+    assert size == 36

     catalog.enlist_source(f"{src_uri}/dogs/", update=True)
-    stats = listing_stats(src_uri, catalog)
-    assert stats.num_objects == 7
-    assert stats.size == 36
+    num_objects, size = listing_stats(src_uri, catalog)
+    assert num_objects == 7
+    assert size == 36

-    stats = listing_stats(f"{src_uri}/dogs/", catalog)
-    assert stats.num_objects == 4
-    assert stats.size == 15
+    num_objects, size = listing_stats(f"{src_uri}/dogs/", catalog)
+    assert num_objects == 4
+    assert size == 15

     catalog.enlist_source(f"{src_uri}/dogs/")
-    stats = listing_stats(src_uri, catalog)
-    assert stats.num_objects == 7
-    assert stats.size == 36
+    num_objects, size = listing_stats(src_uri, catalog)
+    assert num_objects == 7
+    assert size == 36


 @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -608,15 +609,15 @@ def test_enlist_source_handles_slash(cloud_test_catalog):
     src_path = f"{src_uri}/dogs"

     catalog.enlist_source(src_path)
-    stats = listing_stats(src_path, catalog)
-    assert stats.num_objects == len(DEFAULT_TREE["dogs"])
-    assert stats.size == 15
+    num_objects, size = listing_stats(src_path, catalog)
+    assert num_objects == len(DEFAULT_TREE["dogs"])
+    assert size == 15

     src_path = f"{src_uri}/dogs"
     catalog.enlist_source(src_path, update=True)
-    stats = listing_stats(src_path, catalog)
-    assert stats.num_objects == len(DEFAULT_TREE["dogs"])
-    assert stats.size == 15
+    num_objects, size = listing_stats(src_path, catalog)
+    assert num_objects == len(DEFAULT_TREE["dogs"])
+    assert size == 15


 @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -626,10 +627,10 @@ def test_enlist_source_handles_glob(cloud_test_catalog):
     src_path = f"{src_uri}/dogs/*.jpg"

     catalog.enlist_source(src_path)
-    stats = listing_stats(src_path, catalog)
+    num_objects, size = listing_stats(src_path, catalog)

-    assert stats.num_objects == len(DEFAULT_TREE["dogs"])
-    assert stats.size == 15
+    assert num_objects == len(DEFAULT_TREE["dogs"])
+    assert size == 15


 @pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
7 changes: 4 additions & 3 deletions tests/func/test_datachain.py
@@ -20,7 +20,7 @@
 from datachain import DataModel, func
 from datachain.catalog.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
 from datachain.data_storage.sqlite import SQLiteWarehouse
-from datachain.dataset import DatasetDependencyType, DatasetStats
+from datachain.dataset import DatasetDependencyType
 from datachain.func import path as pathfunc
 from datachain.lib.dc import C, DataChain
 from datachain.lib.file import File, ImageFile
@@ -515,8 +515,9 @@ def test_from_storage_dataset_stats(tmp_dir, test_session):
     dc = DataChain.from_storage(tmp_dir.as_uri(), session=test_session).save(
         "test-data"
     )
-    stats = test_session.catalog.dataset_stats(dc.name, dc.version)
-    assert stats == DatasetStats(num_objects=4, size=20)
+    version = test_session.catalog.get_dataset(dc.name).get_version(dc.version)
+    assert version.num_objects == 4
+    assert version.size == 20


 def test_from_storage_check_rows(tmp_dir, test_session):
6 changes: 3 additions & 3 deletions tests/func/test_datasets.py
@@ -845,9 +845,9 @@ def test_row_random(cloud_test_catalog):

 def test_dataset_stats_registered_ds(cloud_test_catalog, dogs_dataset):
     catalog = cloud_test_catalog.catalog
-    stats = catalog.dataset_stats(dogs_dataset.name, 1)
-    assert stats.num_objects == 4
-    assert stats.size == 15
+    dataset = catalog.get_dataset(dogs_dataset.name).get_version(1)
+    assert dataset.num_objects == 4
+    assert dataset.size == 15
     rows_count = catalog.warehouse.dataset_rows_count(dogs_dataset, 1)
     assert rows_count == 4

