From 162de46b6f681c6d2947890171a0c08310b93f40 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 9 Jan 2024 12:25:27 +0100 Subject: [PATCH 01/19] Refactor registrar into multiple files --- src/cli/register.py | 2 +- src/dataregistry/__init__.py | 2 +- src/dataregistry/query.py | 2 +- src/dataregistry/registrar/__init__.py | 1 + .../{registrar.py => registrar/dataset.py} | 473 ++++++------------ src/dataregistry/registrar/dataset_alias.py | 63 +++ src/dataregistry/registrar/execution.py | 99 ++++ src/dataregistry/registrar/registrar.py | 77 +++ .../{ => registrar}/registrar_util.py | 5 +- tests/end_to_end_tests/test_end_to_end.py | 6 +- .../test_query_cli_entries.py | 4 +- 11 files changed, 399 insertions(+), 335 deletions(-) create mode 100644 src/dataregistry/registrar/__init__.py rename src/dataregistry/{registrar.py => registrar/dataset.py} (65%) create mode 100644 src/dataregistry/registrar/dataset_alias.py create mode 100644 src/dataregistry/registrar/execution.py create mode 100644 src/dataregistry/registrar/registrar.py rename src/dataregistry/{ => registrar}/registrar_util.py (98%) diff --git a/src/cli/register.py b/src/cli/register.py index f5f002f2..60c578e5 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -37,7 +37,7 @@ def register_dataset(args): ) # Register new dataset. - new_id = datareg.Registrar.register_dataset( + new_id = datareg.Registrar.dataset.create( args.relative_path, args.version, name=args.name, diff --git a/src/dataregistry/__init__.py b/src/dataregistry/__init__.py index b940e1f5..c32e52e1 100644 --- a/src/dataregistry/__init__.py +++ b/src/dataregistry/__init__.py @@ -1,7 +1,7 @@ from ._version import __version__ from .db_basic import * from .registrar import * -from .registrar_util import * +#from .registrar_util import * from .query import * from .git_util import * from .DataRegistry import DataRegistry diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index b855909d..4463824a 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -2,7 +2,7 @@ from sqlalchemy import text, select import sqlalchemy.sql.sqltypes as sqltypes import pandas as pd -from dataregistry.registrar_util import _form_dataset_path +from dataregistry.registrar.registrar_util import _form_dataset_path from dataregistry.exceptions import DataRegistryNYI, DataRegistryException import os diff --git a/src/dataregistry/registrar/__init__.py b/src/dataregistry/registrar/__init__.py new file mode 100644 index 00000000..34cafb33 --- /dev/null +++ b/src/dataregistry/registrar/__init__.py @@ -0,0 +1 @@ +from .registrar import _OWNER_TYPES, Registrar diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar/dataset.py similarity index 65% rename from src/dataregistry/registrar.py rename to src/dataregistry/registrar/dataset.py index baee35c1..300d3ad2 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar/dataset.py @@ -1,304 +1,38 @@ -import time import os +import time from datetime import datetime -# from sqlalchemy import MetaData, Table, Column, insert, text, -from sqlalchemy import update, select - -# from sqlalchemy.exc import DBAPIError, IntegrityError from dataregistry.db_basic import add_table_row -from dataregistry.registrar_util import _form_dataset_path, get_directory_info -from dataregistry.registrar_util import _parse_version_string, _bump_version -from dataregistry.registrar_util import ( +from sqlalchemy import select, update + +from .registrar_util import ( + _bump_version, + _copy_data, + 
_form_dataset_path, _name_from_relpath, + _parse_version_string, _read_configuration_file, - _copy_data, + get_directory_info, ) -from dataregistry.db_basic import TableMetadata - -# from dataregistry.exceptions import * - -__all__ = ["Registrar"] # Default maximum allowed length of configuration file allowed to be ingested _DEFAULT_MAX_CONFIG = 10000 -# Allowed owner types -_OWNER_TYPES = {"user", "project", "group", "production"} - - -class Registrar: - def __init__( - self, - db_connection, - root_dir, - owner=None, - owner_type=None, - ): - """ - Class to register new datasets, executions and alias names. - - Parameters - ---------- - db_connection : DbConnection object - Encompasses sqlalchemy engine, dialect (database backend) - and schema version - root_dir : str - Root directory of the dataregistry on disk - owner : str - To set the default owner for all registered datasets in this - instance. - owner_type : str - To set the default owner_type for all registered datasets in this - instance. - """ - - # Root directory on disk for data registry files - self._root_dir = root_dir - - # Database engine and dialect. - self._engine = db_connection.engine - self._schema = db_connection.schema - - # Link to Table Metadata. - self._metadata_getter = TableMetadata(db_connection) - - # Store user id - self._uid = os.getenv("USER") - - # Default owner and owner_type's - self._owner = owner - self._owner_type = owner_type - - def get_owner_types(self): - """ - Returns a list of allowed owner_types that can be registered within the - data registry. - - Returns - ------- - - : set - Set of owner_types - """ - - return _OWNER_TYPES - - def _get_table_metadata(self, tbl): - return self._metadata_getter.get(tbl) - - def _find_previous(self, relative_path, dataset_table, owner, owner_type): - """ - Check to see if a dataset exists already in the registry, and if we are - allowed to overwrite it. - - Parameters - ---------- - relative_path : str - Relative path to dataset - dataset_table : SQLAlchemy Table object - Link to the dataset table - owner : str - Owner of the dataset - owner_type : str - - Returns - ------- - previous : list - List of dataset IDs that are overwritable - """ - - # Search for dataset in the registry. - stmt = ( - select(dataset_table.c.dataset_id, dataset_table.c.is_overwritable) - .where( - dataset_table.c.relative_path == relative_path, - dataset_table.c.owner == owner, - dataset_table.c.owner_type == owner_type, - ) - .order_by(dataset_table.c.dataset_id.desc()) - ) - - with self._engine.connect() as conn: - result = conn.execute(stmt) - conn.commit() - - # If the datasets are overwritable, log their ID, else return None - previous = [] - for r in result: - if not r.is_overwritable: - return None - else: - previous.append(r.dataset_id) - - return previous - - def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): - """ - Find characteristics of dataset (i.e., is it a file or directory, how - many files and total disk space of the dataset). - - If old_location is not None, copy the dataset files and directories - into the data registry. 
- - Parameters - ---------- - relative_path : str - Relative path of dataset in the data registry - old_location : str - Location of data (if not already in the data registry root) - Data will be copied from this location - owner : str - Owner of the dataset - owner_type : str - Owner type of the dataset - verbose : bool - True for extra output - - Returns - ------- - dataset_organization : str - "file", "directory", or "dummy" - num_files : int - Total number of files making up dataset - total_size : float - Total disk space of dataset in bytes - ds_creation_date : datetime - When file or directory was created - success : bool - True if data copy was successful, else False - """ - - # Get destination directory in data registry. - dest = _form_dataset_path( - owner_type, - owner, - relative_path, - schema=self._schema, - root_dir=self._root_dir, - ) - - # Is the data already on location, or coming from somewhere new? - if old_location: - loc = old_location - else: - loc = dest - - # Get metadata on dataset. - if os.path.isfile(loc): - dataset_organization = "file" - elif os.path.isdir(loc): - dataset_organization = "directory" - else: - raise FileNotFoundError(f"Dataset {loc} not found") - - if verbose: - tic = time.time() - print("Collecting metadata...", end="") - - ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) - - if dataset_organization == "directory": - num_files, total_size = get_directory_info(loc) - else: - num_files = 1 - total_size = os.path.getsize(loc) - if verbose: - print(f"took {time.time()-tic:.2f}s") - - # Copy data into data registry - if old_location: - if verbose: - tic = time.time() - print( - f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", - end="", - ) - _copy_data(dataset_organization, old_location, dest) - if verbose: - print(f"took {time.time()-tic:.2f}") - else: - success = True - - return dataset_organization, num_files, total_size, ds_creation_date - def register_execution( - self, - name, - description=None, - execution_start=None, - locale=None, - configuration=None, - input_datasets=[], - input_production_datasets=[], - max_config_length=_DEFAULT_MAX_CONFIG, - ): +class RegistrarDataset: + def __init__(self, parent): """ - Register a new execution in the DESC data registry. - - Any args marked with '**' share their name with the associated column - in the registry schema. Descriptions of what these columns are can be - found in `schema.yaml` or the documentation. + Wrapper class to register/modify/delete dataset entries. Parameters ---------- - name** : str - description** : str, optional - execution_start** : datetime, optional - locale** : str, optional - configuration** : str, optional - input_datasets** : list, optional - input_production_datasets** : list, optional - max_config_length : int, optional - Maxiumum number of lines to read from a configuration file - - Returns - ------- - my_id : int - The execution ID of the new row relating to this entry + parent : Registrar class + Contains db_connection, engine, etc """ - # Put the execution information together - values = {"name": name} - if locale: - values["locale"] = locale - if execution_start: - values["execution_start"] = execution_start - if description: - values["description"] = description - values["register_date"] = datetime.now() - values["creator_uid"] = self._uid - - exec_table = self._get_table_metadata("execution") - dependency_table = self._get_table_metadata("dependency") + self.parent = parent - # Read configuration file. 
Enter contents as a raw string. - if configuration: - values["configuration"] = _read_configuration_file( - configuration, max_config_length - ) - - # Enter row into data registry database - with self._engine.connect() as conn: - my_id = add_table_row(conn, exec_table, values, commit=False) - - # handle dependencies - for d in input_datasets: - values["register_date"] = datetime.now() - values["input_id"] = d - values["execution_id"] = my_id - add_table_row(conn, dependency_table, values, commit=False) - - # handle production dependencies - for d in input_production_datasets: - values["register_date"] = datetime.now() - values["input_production_id"] = d - values["execution_id"] = my_id - add_table_row(conn, dependency_table, values, commit=False) - - conn.commit() - return my_id - - def register_dataset( + def create( self, relative_path, version, @@ -385,19 +119,19 @@ def register_dataset( # Make sure the owner_type is legal if owner_type is None: - if self._owner_type is not None: - owner_type = self._owner_type + if self.parent._owner_type is not None: + owner_type = self.parent._owner_type else: owner_type = "user" - if owner_type not in _OWNER_TYPES: + if owner_type not in self.parent.get_owner_types(): raise ValueError(f"{owner_type} is not a valid owner_type") # Establish the dataset owner if owner is None: - if self._owner is not None: - owner = self._owner + if self.parent._owner is not None: + owner = self.parent._owner else: - owner = self._uid + owner = self.parent._uid if owner_type == "production": owner = "production" @@ -407,12 +141,12 @@ def register_dataset( raise ValueError("Cannot overwrite production entries") if version_suffix is not None: raise ValueError("Production entries can't have version suffix") - if self._schema != "production": + if self.parent._schema != "production": raise ValueError( "Only the production schema can handle owner_type='production'" ) else: - if self._schema == "production": + if self.parent._schema == "production": raise ValueError( "Only the production schema can handle owner_type='production'" ) @@ -422,7 +156,7 @@ def register_dataset( name = _name_from_relpath(relative_path) # Look for previous entries. 
Fail if not overwritable - dataset_table = self._get_table_metadata("dataset") + dataset_table = self.parent._get_table_metadata("dataset") previous = self._find_previous(relative_path, dataset_table, owner, owner_type) if previous is None: @@ -437,7 +171,7 @@ def register_dataset( # Generate new version fields based on previous entries # with the same name field and same suffix (i.e., bump) v_fields = _bump_version( - name, version, version_suffix, dataset_table, self._engine + name, version, version_suffix, dataset_table, self.parent._engine ) version_string = ( f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" @@ -451,7 +185,7 @@ def register_dataset( execution_name = f"{execution_name}-{version_suffix}" if execution_description is None: execution_description = "Fabricated execution for dataset" - execution_id = self.register_execution( + execution_id = self.parent.execution.create( execution_name, description=execution_description, execution_start=execution_start, @@ -487,15 +221,15 @@ def register_dataset( values["register_date"] = datetime.now() values["owner_type"] = owner_type values["owner"] = owner - values["creator_uid"] = self._uid - values["register_root_dir"] = self._root_dir + values["creator_uid"] = self.parent._uid + values["register_root_dir"] = self.parent._root_dir # We tentatively start with an "invalid" dataset in the database. This # will be upgraded to True if the data copying (if any) was successful. values["is_valid"] = False # Create a new row in the data registry database. - with self._engine.connect() as conn: + with self.parent._engine.connect() as conn: prim_key = add_table_row(conn, dataset_table, values, commit=False) if len(previous) > 0: @@ -529,7 +263,7 @@ def register_dataset( ds_creation_date = creation_date # Copy was successful, update the entry with dataset metadata - with self._engine.connect() as conn: + with self.parent._engine.connect() as conn: update_stmt = ( update(dataset_table) .where(dataset_table.c.dataset_id == prim_key) @@ -546,44 +280,133 @@ def register_dataset( return prim_key, execution_id - def register_dataset_alias(self, aliasname, dataset_id): + def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): """ - Register a new dataset alias in the DESC data registry. + Find characteristics of dataset (i.e., is it a file or directory, how + many files and total disk space of the dataset). - Any args marked with '**' share their name with the associated column - in the registry schema. Descriptions of what these columns are can be - found in `schema.yaml` or the documentation. + If old_location is not None, copy the dataset files and directories + into the data registry. 
Parameters ---------- - aliasname** : str - dataset_id** : int + relative_path : str + Relative path of dataset in the data registry + old_location : str + Location of data (if not already in the data registry root) + Data will be copied from this location + owner : str + Owner of the dataset + owner_type : str + Owner type of the dataset + verbose : bool + True for extra output Returns ------- - prim_key : int - The dataset_alias ID of the new row relating to this entry + dataset_organization : str + "file", "directory", or "dummy" + num_files : int + Total number of files making up dataset + total_size : float + Total disk space of dataset in bytes + ds_creation_date : datetime + When file or directory was created """ - now = datetime.now() - values = {"alias": aliasname} - values["dataset_id"] = dataset_id - values["register_date"] = now - values["creator_uid"] = self._uid - - alias_table = self._get_table_metadata("dataset_alias") - with self._engine.connect() as conn: - prim_key = add_table_row(conn, alias_table, values) - - # Update any other alias rows which have been superseded - stmt = ( - update(alias_table) - .where( - alias_table.c.alias == aliasname, - alias_table.c.dataset_alias_id != prim_key, + # Get destination directory in data registry. + dest = _form_dataset_path( + owner_type, + owner, + relative_path, + schema=self.parent._schema, + root_dir=self.parent._root_dir, + ) + + # Is the data already on location, or coming from somewhere new? + if old_location: + loc = old_location + else: + loc = dest + + # Get metadata on dataset. + if os.path.isfile(loc): + dataset_organization = "file" + elif os.path.isdir(loc): + dataset_organization = "directory" + else: + raise FileNotFoundError(f"Dataset {loc} not found") + + if verbose: + tic = time.time() + print("Collecting metadata...", end="") + + ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) + + if dataset_organization == "directory": + num_files, total_size = get_directory_info(loc) + else: + num_files = 1 + total_size = os.path.getsize(loc) + if verbose: + print(f"took {time.time()-tic:.2f}s") + + # Copy data into data registry + if old_location: + if verbose: + tic = time.time() + print( + f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", + end="", ) - .values(supersede_date=now) + _copy_data(dataset_organization, old_location, dest) + if verbose: + print(f"took {time.time()-tic:.2f}") + + return dataset_organization, num_files, total_size, ds_creation_date + + def _find_previous(self, relative_path, dataset_table, owner, owner_type): + """ + Check to see if a dataset exists already in the registry, and if we are + allowed to overwrite it. + + Parameters + ---------- + relative_path : str + Relative path to dataset + dataset_table : SQLAlchemy Table object + Link to the dataset table + owner : str + Owner of the dataset + owner_type : str + + Returns + ------- + previous : list + List of dataset IDs that are overwritable + """ + + # Search for dataset in the registry. 
+ stmt = ( + select(dataset_table.c.dataset_id, dataset_table.c.is_overwritable) + .where( + dataset_table.c.relative_path == relative_path, + dataset_table.c.owner == owner, + dataset_table.c.owner_type == owner_type, ) - conn.execute(stmt) + .order_by(dataset_table.c.dataset_id.desc()) + ) + + with self.parent._engine.connect() as conn: + result = conn.execute(stmt) conn.commit() - return prim_key + + # If the datasets are overwritable, log their ID, else return None + previous = [] + for r in result: + if not r.is_overwritable: + return None + else: + previous.append(r.dataset_id) + + return previous diff --git a/src/dataregistry/registrar/dataset_alias.py b/src/dataregistry/registrar/dataset_alias.py new file mode 100644 index 00000000..f963b0c0 --- /dev/null +++ b/src/dataregistry/registrar/dataset_alias.py @@ -0,0 +1,63 @@ +from datetime import datetime + +from dataregistry.db_basic import add_table_row +from sqlalchemy import update + +# Default maximum allowed length of configuration file allowed to be ingested +_DEFAULT_MAX_CONFIG = 10000 + + +class RegistrarDatasetAlias: + def __init__(self, parent): + """ + Wrapper class to register/modify/delete execution entries. + + Parameters + ---------- + parent : Registrar class + Contains db_connection, engine, etc + """ + + self.parent = parent + + def create(self, aliasname, dataset_id): + """ + Register a new dataset alias in the DESC data registry. + + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + + Parameters + ---------- + aliasname** : str + dataset_id** : int + + Returns + ------- + prim_key : int + The dataset_alias ID of the new row relating to this entry + """ + + now = datetime.now() + values = {"alias": aliasname} + values["dataset_id"] = dataset_id + values["register_date"] = now + values["creator_uid"] = self.parent._uid + + alias_table = self.parent._get_table_metadata("dataset_alias") + with self.parent._engine.connect() as conn: + prim_key = add_table_row(conn, alias_table, values) + + # Update any other alias rows which have been superseded + stmt = ( + update(alias_table) + .where( + alias_table.c.alias == aliasname, + alias_table.c.dataset_alias_id != prim_key, + ) + .values(supersede_date=now) + ) + conn.execute(stmt) + conn.commit() + return prim_key diff --git a/src/dataregistry/registrar/execution.py b/src/dataregistry/registrar/execution.py new file mode 100644 index 00000000..11d6f959 --- /dev/null +++ b/src/dataregistry/registrar/execution.py @@ -0,0 +1,99 @@ +from datetime import datetime + +from dataregistry.db_basic import add_table_row + +from .registrar_util import _read_configuration_file + +# Default maximum allowed length of configuration file allowed to be ingested +_DEFAULT_MAX_CONFIG = 10000 + + +class RegistrarExecution: + def __init__(self, parent): + """ + Wrapper class to register/modify/delete execution entries. + + Parameters + ---------- + parent : Registrar class + Contains db_connection, engine, etc + """ + + self.parent = parent + + def create( + self, + name, + description=None, + execution_start=None, + locale=None, + configuration=None, + input_datasets=[], + input_production_datasets=[], + max_config_length=_DEFAULT_MAX_CONFIG, + ): + """ + Register a new execution in the DESC data registry. + + Any args marked with '**' share their name with the associated column + in the registry schema. 
Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + + Parameters + ---------- + name** : str + description** : str, optional + execution_start** : datetime, optional + locale** : str, optional + configuration** : str, optional + input_datasets** : list, optional + input_production_datasets** : list, optional + max_config_length : int, optional + Maxiumum number of lines to read from a configuration file + + Returns + ------- + my_id : int + The execution ID of the new row relating to this entry + """ + + # Put the execution information together + values = {"name": name} + if locale: + values["locale"] = locale + if execution_start: + values["execution_start"] = execution_start + if description: + values["description"] = description + values["register_date"] = datetime.now() + values["creator_uid"] = self.parent._uid + + exec_table = self.parent._get_table_metadata("execution") + dependency_table = self.parent._get_table_metadata("dependency") + + # Read configuration file. Enter contents as a raw string. + if configuration: + values["configuration"] = _read_configuration_file( + configuration, max_config_length + ) + + # Enter row into data registry database + with self.parent._engine.connect() as conn: + my_id = add_table_row(conn, exec_table, values, commit=False) + + # handle dependencies + for d in input_datasets: + values["register_date"] = datetime.now() + values["input_id"] = d + values["execution_id"] = my_id + add_table_row(conn, dependency_table, values, commit=False) + + # handle production dependencies + for d in input_production_datasets: + values["register_date"] = datetime.now() + values["input_production_id"] = d + values["execution_id"] = my_id + add_table_row(conn, dependency_table, values, commit=False) + + conn.commit() + return my_id diff --git a/src/dataregistry/registrar/registrar.py b/src/dataregistry/registrar/registrar.py new file mode 100644 index 00000000..b29bfea1 --- /dev/null +++ b/src/dataregistry/registrar/registrar.py @@ -0,0 +1,77 @@ +import os + +from dataregistry.db_basic import TableMetadata + +from .dataset import RegistrarDataset +from .dataset_alias import RegistrarDatasetAlias +from .execution import RegistrarExecution + +__all__ = ["Registrar"] + +# Allowed owner types +_OWNER_TYPES = {"user", "project", "group", "production"} + + +class Registrar: + def __init__( + self, + db_connection, + root_dir, + owner=None, + owner_type=None, + ): + """ + Class to register new datasets, executions and alias names. + + Parameters + ---------- + db_connection : DbConnection object + Encompasses sqlalchemy engine, dialect (database backend) + and schema version + root_dir : str + Root directory of the dataregistry on disk + owner : str + To set the default owner for all registered datasets in this + instance. + owner_type : str + To set the default owner_type for all registered datasets in this + instance. + """ + + # Root directory on disk for data registry files + self._root_dir = root_dir + + # Database engine and dialect. + self._engine = db_connection.engine + self._schema = db_connection.schema + + # Link to Table Metadata. 
+ self._metadata_getter = TableMetadata(db_connection) + + # Store user id + self._uid = os.getenv("USER") + + # Default owner and owner_type's + self._owner = owner + self._owner_type = owner_type + + # Class wrappers which are used to create/modify/delete entries + self.dataset = RegistrarDataset(self) + self.execution = RegistrarExecution(self) + self.dataset_alias = RegistrarDatasetAlias(self) + + def get_owner_types(self): + """ + Returns a list of allowed owner_types that can be registered within the + data registry. + + Returns + ------- + - : set + Set of owner_types + """ + + return _OWNER_TYPES + + def _get_table_metadata(self, tbl): + return self._metadata_getter.get(tbl) diff --git a/src/dataregistry/registrar_util.py b/src/dataregistry/registrar/registrar_util.py similarity index 98% rename from src/dataregistry/registrar_util.py rename to src/dataregistry/registrar/registrar_util.py index 17d8adee..2ca21e6e 100644 --- a/src/dataregistry/registrar_util.py +++ b/src/dataregistry/registrar/registrar_util.py @@ -2,9 +2,10 @@ import os import re import warnings -from sqlalchemy import MetaData, Table, Column, text, select from shutil import copyfile, copytree, rmtree +from sqlalchemy import select + __all__ = [ "_parse_version_string", "_bump_version", @@ -329,7 +330,7 @@ def _compute_checksum(file_path): os.rename(temp_dest, dest) print( - f"Something went wrong during data copying, aborting." + "Something went wrong during data copying, aborting." "Note an entry in the registry database will still have" "been created" ) diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index 3943131c..47cf792c 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -82,7 +82,7 @@ def _insert_alias_entry(datareg, name, dataset_id): The alias ID for this new entry """ - new_id = datareg.Registrar.register_dataset_alias(name, dataset_id) + new_id = datareg.Registrar.dataset_alias.create(name, dataset_id) assert new_id is not None, "Trying to create a dataset alias that already exists" print(f"Created dataset alias entry with id {new_id}") @@ -113,7 +113,7 @@ def _insert_execution_entry( The execution ID for this new entry """ - new_id = datareg.Registrar.register_execution( + new_id = datareg.Registrar.execution.create( name, description=description, input_datasets=input_datasets, @@ -201,7 +201,7 @@ def _insert_dataset_entry( make_sym_link = False # Add new entry. 
- dataset_id, execution_id = datareg.Registrar.register_dataset( + dataset_id, execution_id = datareg.Registrar.dataset.create( relpath, version, version_suffix=version_suffix, diff --git a/tests/end_to_end_tests/test_query_cli_entries.py b/tests/end_to_end_tests/test_query_cli_entries.py index 209a8588..154ed768 100644 --- a/tests/end_to_end_tests/test_query_cli_entries.py +++ b/tests/end_to_end_tests/test_query_cli_entries.py @@ -6,7 +6,7 @@ from dataregistry.db_basic import SCHEMA_VERSION # Establish connection to database (default schema) -datareg = DataRegistry(root_dir="temp") +datareg = DataRegistry(root_dir="temp_root_dir") def test_cli_basic_dataset(): @@ -44,7 +44,7 @@ def test_cli_production_entry(): if datareg.Query._dialect != "sqlite": # Establish connection to database (production schema) - datareg_prod = DataRegistry(schema="production") + datareg_prod = DataRegistry(schema="production", root_dir="temp_root_dir") f = datareg_prod.Query.gen_filter( "dataset.name", "==", "my_production_cli_dataset" From fa4a22ef931a7ff8bc7b811649107b8899a200a3 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 9 Jan 2024 13:26:11 +0100 Subject: [PATCH 02/19] Fix unit tests --- tests/unit_tests/test_registrar_util.py | 2 +- tests/unit_tests/test_rutil_copy_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_registrar_util.py b/tests/unit_tests/test_registrar_util.py index ebad041c..247a0805 100644 --- a/tests/unit_tests/test_registrar_util.py +++ b/tests/unit_tests/test_registrar_util.py @@ -1,4 +1,4 @@ -from dataregistry.registrar_util import ( +from dataregistry.registrar.registrar_util import ( _parse_version_string, _name_from_relpath, _form_dataset_path, diff --git a/tests/unit_tests/test_rutil_copy_data.py b/tests/unit_tests/test_rutil_copy_data.py index 91a20c38..4f6c4ced 100644 --- a/tests/unit_tests/test_rutil_copy_data.py +++ b/tests/unit_tests/test_rutil_copy_data.py @@ -1,6 +1,6 @@ import pytest import os -from dataregistry.registrar_util import _copy_data +from dataregistry.registrar.registrar_util import _copy_data @pytest.fixture From f40ae362437da02f249ee31ac0bb45e06c122cd4 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 9 Jan 2024 13:36:43 +0100 Subject: [PATCH 03/19] Added placeholder functions --- src/dataregistry/registrar/dataset.py | 20 ++++++++++++++++++-- src/dataregistry/registrar/dataset_alias.py | 18 +++++++++++++++++- src/dataregistry/registrar/execution.py | 18 +++++++++++++++++- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 300d3ad2..52cbf025 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -60,7 +60,7 @@ def create( max_config_length=_DEFAULT_MAX_CONFIG, ): """ - Register a new dataset in the DESC data registry. + Create a new dataset entry in the DESC data registry. Any args marked with '**' share their name with the associated column in the registry schema. 
Descriptions of what these columns are can be @@ -151,7 +151,7 @@ def create( "Only the production schema can handle owner_type='production'" ) - # If name not passed, automatically generate a name from the relative path + # If `name` not passed, automatically generate a name from the relative path if name is None: name = _name_from_relpath(relative_path) @@ -410,3 +410,19 @@ def _find_previous(self, relative_path, dataset_table, owner, owner_type): previous.append(r.dataset_id) return previous + + def delete(self): + """ + Delete a dataset entry from the DESC data registry. + + """ + + raise NotImplementedError + + def modify(self): + """ + Modify a dataset entry in the DESC data registry. + + """ + + raise NotImplementedError diff --git a/src/dataregistry/registrar/dataset_alias.py b/src/dataregistry/registrar/dataset_alias.py index f963b0c0..baaf5054 100644 --- a/src/dataregistry/registrar/dataset_alias.py +++ b/src/dataregistry/registrar/dataset_alias.py @@ -22,7 +22,7 @@ def __init__(self, parent): def create(self, aliasname, dataset_id): """ - Register a new dataset alias in the DESC data registry. + Create a new dataset alias entry in the DESC data registry. Any args marked with '**' share their name with the associated column in the registry schema. Descriptions of what these columns are can be @@ -61,3 +61,19 @@ def create(self, aliasname, dataset_id): conn.execute(stmt) conn.commit() return prim_key + + def delete(self): + """ + Delete a dataset alias entry from the DESC data registry. + + """ + + raise NotImplementedError + + def modify(self): + """ + Modify a dataset alias entry in the DESC data registry. + + """ + + raise NotImplementedError diff --git a/src/dataregistry/registrar/execution.py b/src/dataregistry/registrar/execution.py index 11d6f959..fed95823 100644 --- a/src/dataregistry/registrar/execution.py +++ b/src/dataregistry/registrar/execution.py @@ -33,7 +33,7 @@ def create( max_config_length=_DEFAULT_MAX_CONFIG, ): """ - Register a new execution in the DESC data registry. + Create a new execution entry in the DESC data registry. Any args marked with '**' share their name with the associated column in the registry schema. Descriptions of what these columns are can be @@ -97,3 +97,19 @@ def create( conn.commit() return my_id + + def delete(self): + """ + Delete an execution entry from the DESC data registry. + + """ + + raise NotImplementedError + + def modify(self): + """ + Modify an execution entry in the DESC data registry. + + """ + + raise NotImplementedError From 76a21984d9a7aaa07db924a49085eb8329d5e080 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 9 Jan 2024 16:00:18 +0100 Subject: [PATCH 04/19] Add dataset status field to the dataset table, and archive/delete/move information --- scripts/create_registry_db.py | 4 ++-- src/dataregistry/registrar/dataset.py | 9 ++++----- src/dataregistry/schema/schema.yaml | 25 ++++++++++++++++++------- tests/end_to_end_tests/test_database.py | 2 +- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index 126701c7..35c431a1 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -240,9 +240,9 @@ def _Dependency(schema, has_production): # The following should be adjusted whenever there is a change to the structure # of the database tables. 
_DB_VERSION_MAJOR = 2 -_DB_VERSION_MINOR = 0 +_DB_VERSION_MINOR = 1 _DB_VERSION_PATCH = 0 -_DB_VERSION_COMMENT = "Added production dependencies" +_DB_VERSION_COMMENT = "Add dataset status" # Parse command line arguments parser = argparse.ArgumentParser( diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 52cbf025..23e2ecbb 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -68,7 +68,7 @@ def create( First, the dataset entry is created in the database. If success, the data is then copied (if `old_location` was provided). Only if both - steps are successful will there be `is_valid=True` entry in the registry. + steps are successful will there be `status=1` entry in the registry. Parameters ---------- @@ -217,7 +217,6 @@ def create( values["is_overwritten"] = False values["is_external_link"] = False values["is_archived"] = False - values["is_valid"] = True values["register_date"] = datetime.now() values["owner_type"] = owner_type values["owner"] = owner @@ -225,8 +224,8 @@ def create( values["register_root_dir"] = self.parent._root_dir # We tentatively start with an "invalid" dataset in the database. This - # will be upgraded to True if the data copying (if any) was successful. - values["is_valid"] = False + # will be upgraded to valid if the data copying (if any) was successful. + values["status"] = -1 # Create a new row in the data registry database. with self.parent._engine.connect() as conn: @@ -272,7 +271,7 @@ def create( nfiles=num_files, total_disk_space=total_size / 1024 / 1024, creation_date=ds_creation_date, - is_valid=True, + status=1, ) ) conn.execute(update_stmt) diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index 0e5896da..b8cb646e 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -248,10 +248,6 @@ dataset: type: "String" description: "User provided human-readable description of the dataset" cli_optional: True - is_valid: - type: "Boolean" - nullable: False - description: "False if, e.g., copy failed" execution_id: type: "Integer" foreign_key: True @@ -273,7 +269,22 @@ dataset: type: "Boolean" nullable: False description: "True if an external link" - is_archived: - type: "Boolean" + status: + type: "Integer" nullable: False - description: "True if dataset is archived" + description: "What is the status of the dataset? 
-1: Invalid (e.g., copy data failed during creation), 1: Valid, 2: Archived, 3: deleted" + archive_date: + type: "DateTime" + description: "Dataset archive date" + archive_path: + type: "String" + description: "Path the dataset was archived to" + delete_date: + type: "DateTime" + description: "Date the dataset was deleted" + delete_uid: + type: "String" + description: "User ID of person who deleted the dataset" + move_date: + type: "DateTime" + description: "Date the dataset was last moved" diff --git a/tests/end_to_end_tests/test_database.py b/tests/end_to_end_tests/test_database.py index 24f4b22a..0aa27954 100644 --- a/tests/end_to_end_tests/test_database.py +++ b/tests/end_to_end_tests/test_database.py @@ -52,5 +52,5 @@ def test_db_version(): """ actual_major, actual_minor, actual_patch = datareg.Query.get_db_versioning() assert actual_major == 2, "db major version doesn't match expected" - assert actual_minor == 0, "db minor version doesn't match expected" + assert actual_minor == 1, "db minor version doesn't match expected" assert actual_patch == 0, "db patch version doesn't match expected" From 416a46106ac35d83c1367cd09866aceb743d8d04 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 10 Jan 2024 14:58:35 +0100 Subject: [PATCH 05/19] Add ability to delete a dataset --- src/dataregistry/registrar/dataset.py | 149 +++++++++++++++++----- src/dataregistry/schema/schema.yaml | 2 +- tests/end_to_end_tests/test_end_to_end.py | 108 +++++++++++++++- 3 files changed, 226 insertions(+), 33 deletions(-) diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 23e2ecbb..d19ec3d6 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -157,11 +157,17 @@ def create( # Look for previous entries. 
Fail if not overwritable dataset_table = self.parent._get_table_metadata("dataset") - previous = self._find_previous(relative_path, dataset_table, owner, owner_type) + previous_dataset = self._find_previous( + dataset_table, + relative_path=relative_path, + owner=owner, + owner_type=owner_type, + ) - if previous is None: - print(f"Dataset {relative_path} exists, and is not overwritable") - return None + if previous_dataset is not None: + if not previous_dataset.is_overwritable: + print(f"Dataset {relative_path} exists, and is not overwritable") + return None # Deal with version string (non-special case) if version not in ["major", "minor", "patch"]: @@ -231,11 +237,11 @@ def create( with self.parent._engine.connect() as conn: prim_key = add_table_row(conn, dataset_table, values, commit=False) - if len(previous) > 0: + if previous_dataset is not None: # Update previous rows, setting is_overwritten to True update_stmt = ( update(dataset_table) - .where(dataset_table.c.dataset_id.in_(previous)) + .where(dataset_table.c.dataset_id == previous_dataset.dataset_id) .values(is_overwritten=True) ) conn.execute(update_stmt) @@ -251,11 +257,13 @@ def create( ) = self._handle_data( relative_path, old_location, owner, owner_type, verbose ) + valid_status = 1 else: dataset_organization = "dummy" num_files = 0 total_size = 0 ds_creation_date = None + valid_status = 0 # Case where use is overwriting the dateset `creation_date` if creation_date: @@ -271,7 +279,7 @@ def create( nfiles=num_files, total_disk_space=total_size / 1024 / 1024, creation_date=ds_creation_date, - status=1, + status=valid_status, ) ) conn.execute(update_stmt) @@ -364,59 +372,140 @@ def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): return dataset_organization, num_files, total_size, ds_creation_date - def _find_previous(self, relative_path, dataset_table, owner, owner_type): + def _find_previous( + self, + dataset_table, + relative_path=None, + owner=None, + owner_type=None, + dataset_id=None, + ): """ Check to see if a dataset exists already in the registry, and if we are allowed to overwrite it. + Can search either by `dataset_id`, or a combination of `relative_path`, + `owner` and `owner_type`. + + Only one dataset should ever be found. + Parameters ---------- - relative_path : str - Relative path to dataset dataset_table : SQLAlchemy Table object Link to the dataset table - owner : str + relative_path : str, optional + Relative path to dataset + owner : str, optional Owner of the dataset - owner_type : str + owner_type : str, optional + dataset_id : int, optional Returns ------- - previous : list - List of dataset IDs that are overwritable + r : CursorResult object + Searched dataset """ + # Make sure we have all the relavant information + if dataset_id is None: + if (relative_path is None) or (owner is None) or (owner_type is None): + raise ValueError( + "Must pass relative_path, owner and owner_type to _find_previous" + ) + # Search for dataset in the registry. 
- stmt = ( - select(dataset_table.c.dataset_id, dataset_table.c.is_overwritable) - .where( - dataset_table.c.relative_path == relative_path, - dataset_table.c.owner == owner, - dataset_table.c.owner_type == owner_type, + if dataset_id is None: + stmt = ( + select( + dataset_table.c.dataset_id, + dataset_table.c.is_overwritable, + dataset_table.c.status, + dataset_table.c.owner, + dataset_table.c.owner_type, + dataset_table.c.relative_path, + ) + .where( + dataset_table.c.relative_path == relative_path, + dataset_table.c.owner == owner, + dataset_table.c.owner_type == owner_type, + ) + .order_by(dataset_table.c.dataset_id.desc()) + ) + else: + stmt = ( + select( + dataset_table.c.dataset_id, + dataset_table.c.is_overwritable, + dataset_table.c.status, + dataset_table.c.owner, + dataset_table.c.owner_type, + dataset_table.c.relative_path, + ) + .where( + dataset_table.c.dataset_id == dataset_id, + ) + .order_by(dataset_table.c.dataset_id.desc()) ) - .order_by(dataset_table.c.dataset_id.desc()) - ) with self.parent._engine.connect() as conn: result = conn.execute(stmt) conn.commit() # If the datasets are overwritable, log their ID, else return None - previous = [] for r in result: - if not r.is_overwritable: - return None - else: - previous.append(r.dataset_id) + return r - return previous + return None - def delete(self): + def delete(self, dataset_id): """ Delete a dataset entry from the DESC data registry. + This will remove the raw data from the root dir, but the dataset entry + remains in the registry (now with `status=3`). + + Parameters + ---------- + dataset_id : int + Dataset we want to delete from the registry """ - raise NotImplementedError + # First make sure the given dataset id is in the registry + dataset_table = self.parent._get_table_metadata("dataset") + previous_dataset = self._find_previous(dataset_table, dataset_id=dataset_id) + + if previous_dataset is None: + raise ValueError(f"Dataset ID {dataset_id} does not exist") + if previous_dataset.status not in [0, 1]: + raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") + + # Update the status of the dataset to deleted + with self.parent._engine.connect() as conn: + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == dataset_id) + .values( + status=3, + delete_date=datetime.now(), + delete_uid=self.parent._uid, + ) + ) + conn.execute(update_stmt) + conn.commit() + + # Delete the physical data in the root_dir + if previous_dataset.status == 1: + data_path = _form_dataset_path( + previous_dataset.owner_type, + previous_dataset.owner, + previous_dataset.relative_path, + schema=self.parent._schema, + root_dir=self.parent._root_dir, + ) + print(f"Deleting data {data_path}") + os.remove(data_path) + + print(f"Deleted {dataset_id} from data registry") def modify(self): """ diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index b8cb646e..31200428 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -272,7 +272,7 @@ dataset: status: type: "Integer" nullable: False - description: "What is the status of the dataset? -1: Invalid (e.g., copy data failed during creation), 1: Valid, 2: Archived, 3: deleted" + description: "What is the status of the dataset? 
-1: Invalid (e.g., copy data failed during creation), 0: Valid dummy dataset 1: Valid dataset, 2: Archived, 3: deleted" archive_date: type: "DateTime" description: "Dataset archive date" diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index 47cf792c..bda597d8 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -7,6 +7,8 @@ from dataregistry.registrar import _OWNER_TYPES import pytest +from dataregistry.registrar.registrar_util import _form_dataset_path + @pytest.fixture def dummy_file(tmp_path): @@ -30,8 +32,9 @@ def dummy_file(tmp_path): tmp_src_dir = tmp_path / "source" tmp_src_dir.mkdir() - f = tmp_src_dir / "file1.txt" - f.write_text("i am a dummy file") + for i in range(2): + f = tmp_src_dir / f"file{i+1}.txt" + f.write_text("i am a dummy file") p = tmp_src_dir / "directory1" p.mkdir() @@ -845,3 +848,104 @@ def test_get_dataset_absolute_path(dummy_file): assert v == os.path.join( str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath ) + + +def test_delete_entry_dummy(dummy_file): + """Make a simple (dummy) entry, then delete it, then check it was deleted""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Make sure we raise an exception trying to delete a dataset that doesn't exist + with pytest.raises(ValueError, match="does not exist"): + datareg.Registrar.dataset.delete(10000) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/dummy_dataset_to_delete", + "0.0.1", + "user", + None, + "A dataset to delete", + ) + + # Now delete that entry + datareg.Registrar.dataset.delete(d_id) + + # Check the entry was deleted + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.status", + "dataset.delete_date", + "dataset.delete_uid", + ], + [f], + return_format="cursorresult", + ) + + for r in results: + assert getattr(r, "dataset.status") == 3 + assert getattr(r, "dataset.delete_date") is not None + assert getattr(r, "dataset.delete_uid") is not None + + +def test_delete_entry_real(dummy_file): + """Make a simple (real data) entry, then delete it, then check it was deleted""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Make sure we raise an exception trying to delete a dataset that doesn't exist + with pytest.raises(ValueError, match="does not exist"): + datareg.Registrar.dataset.delete(10000) + + # Add entry + data_path = str(tmp_src_dir / "file2.txt") + assert os.path.isfile(data_path) + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/real_dataset_to_delete", + "0.0.1", + "user", + None, + "A dataset to delete", + old_location=data_path, + is_dummy=False, + ) + + # Now delete that entry + datareg.Registrar.dataset.delete(d_id) + + # Check the entry was set to deleted in the registry + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.status", + "dataset.delete_date", + "dataset.delete_uid", + "dataset.owner", + "dataset.owner_type", + "dataset.relative_path", + ], + [f], + return_format="cursorresult", + ) + + for r in results: + assert getattr(r, "dataset.status") == 3 + assert getattr(r, "dataset.delete_date") is not None + assert getattr(r, "dataset.delete_uid") is not 
None + + # Make sure the file in the root_dir has gone + data_path = _form_dataset_path( + getattr(r, "dataset.owner_type"), + getattr(r, "dataset.owner"), + getattr(r, "dataset.relative_path"), + schema=SCHEMA_VERSION, + root_dir=str(tmp_root_dir), + ) + assert not os.path.isfile(data_path) From 0a50c73d0128c3dbccb84e57d37786044ea0f124 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 12 Jan 2024 14:28:30 +0100 Subject: [PATCH 06/19] Update delete test --- tests/end_to_end_tests/test_end_to_end.py | 1720 ++++++++++----------- 1 file changed, 849 insertions(+), 871 deletions(-) diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index bda597d8..87d6e420 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -12,940 +12,918 @@ @pytest.fixture def dummy_file(tmp_path): - """ - Create some dummy (temporary) files and directories - - Parameters - ---------- - tmp_path : pathlib.Path object - - Returns - ------- - tmp_src_dir : pathlib.Path object - Temporary files we are going to be copying into the registry will be - created in here - tmp_root_dir : pathlib.Path object - Temporary root_dir for the registry we can copy files to - """ - - # Temp dir for files that we copy files from (old_location) - tmp_src_dir = tmp_path / "source" - tmp_src_dir.mkdir() - - for i in range(2): - f = tmp_src_dir / f"file{i+1}.txt" - f.write_text("i am a dummy file") - - p = tmp_src_dir / "directory1" - p.mkdir() - f = p / "file2.txt" - f.write_text("i am another dummy file") - - # Temp root_dir of the registry - tmp_root_dir = tmp_path / "root_dir" - for THIS_SCHEMA in [SCHEMA_VERSION + "/", ""]: - p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}/dummy_dir" - p.mkdir(parents=True) - - f = p / "file1.txt" - f.write_text("i am another dummy file (but on location in a dir)") - - p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}" - f = p / "file1.txt" - f.write_text("i am another dummy file (but on location)") - - # Make a dummy configuration yaml file - data = { - "run_by": "somebody", - "software_version": {"major": 1, "minor": 1, "patch": 0}, - "an_important_list": [1, 2, 3], - } - - # Write the data to the YAML file - with open(tmp_src_dir / "dummy_configuration_file.yaml", "w") as file: - yaml.dump(data, file, default_flow_style=False) - - return tmp_src_dir, tmp_root_dir + """ + Create some dummy (temporary) files and directories + + Parameters + ---------- + tmp_path : pathlib.Path object + + Returns + ------- + tmp_src_dir : pathlib.Path object + Temporary files we are going to be copying into the registry will be + created in here + tmp_root_dir : pathlib.Path object + Temporary root_dir for the registry we can copy files to + """ + + # Temp dir for files that we copy files from (old_location) + tmp_src_dir = tmp_path / "source" + tmp_src_dir.mkdir() + + for i in range(2): + f = tmp_src_dir / f"file{i+1}.txt" + f.write_text("i am a dummy file") + + p = tmp_src_dir / "directory1" + p.mkdir() + f = p / "file2.txt" + f.write_text("i am another dummy file") + + # Temp root_dir of the registry + tmp_root_dir = tmp_path / "root_dir" + for THIS_SCHEMA in [SCHEMA_VERSION + "/", ""]: + p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}/dummy_dir" + p.mkdir(parents=True) + + f = p / "file1.txt" + f.write_text("i am another dummy file (but on location in a dir)") + + p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}" + f = p / "file1.txt" + f.write_text("i am another dummy file 
(but on location)") + + # Make a dummy configuration yaml file + data = { + "run_by": "somebody", + "software_version": {"major": 1, "minor": 1, "patch": 0}, + "an_important_list": [1, 2, 3], + } + + # Write the data to the YAML file + with open(tmp_src_dir / "dummy_configuration_file.yaml", "w") as file: + yaml.dump(data, file, default_flow_style=False) + + return tmp_src_dir, tmp_root_dir def _insert_alias_entry(datareg, name, dataset_id): - """ - Wrapper to create dataset alias entry + """ + Wrapper to create dataset alias entry - Parameters - ---------- - name : str - Name of alias - dataset_id : int - Dataset we are assigning alias name to + Parameters + ---------- + name : str + Name of alias + dataset_id : int + Dataset we are assigning alias name to - Returns - ------- - new_id : int - The alias ID for this new entry - """ + Returns + ------- + new_id : int + The alias ID for this new entry + """ - new_id = datareg.Registrar.dataset_alias.create(name, dataset_id) + new_id = datareg.Registrar.dataset_alias.create(name, dataset_id) - assert new_id is not None, "Trying to create a dataset alias that already exists" - print(f"Created dataset alias entry with id {new_id}") + assert new_id is not None, "Trying to create a dataset alias that already exists" + print(f"Created dataset alias entry with id {new_id}") - return new_id + return new_id def _insert_execution_entry( - datareg, name, description, input_datasets=[], configuration=None + datareg, name, description, input_datasets=[], configuration=None ): - """ - Wrapper to create execution entry - - Parameters - ---------- - name : str - Name of execution - description : str - Description of execution - intput_datasets : list - List of dataset ids - configuration : str - Path to configuration file for execution - - Returns - ------- - new_id : int - The execution ID for this new entry - """ - - new_id = datareg.Registrar.execution.create( - name, - description=description, - input_datasets=input_datasets, - configuration=configuration, - ) - - assert new_id is not None, "Trying to create a execution that already exists" - print(f"Created execution entry with id {new_id}") - - return new_id + """ + Wrapper to create execution entry + + Parameters + ---------- + name : str + Name of execution + description : str + Description of execution + intput_datasets : list + List of dataset ids + configuration : str + Path to configuration file for execution + + Returns + ------- + new_id : int + The execution ID for this new entry + """ + + new_id = datareg.Registrar.execution.create( + name, + description=description, + input_datasets=input_datasets, + configuration=configuration, + ) + + assert new_id is not None, "Trying to create a execution that already exists" + print(f"Created execution entry with id {new_id}") + + return new_id def _insert_dataset_entry( - datareg, - relpath, - version, - owner_type, - owner, - description, - name=None, - execution_id=None, - version_suffix=None, - is_dummy=True, - old_location=None, - is_overwritable=False, - which_datareg=None, - execution_name=None, - execution_description=None, - execution_start=None, - execution_locale=None, - execution_configuration=None, - input_datasets=[], + datareg, + relpath, + version, + owner_type, + owner, + description, + name=None, + execution_id=None, + version_suffix=None, + is_dummy=True, + old_location=None, + is_overwritable=False, + which_datareg=None, + execution_name=None, + execution_description=None, + execution_start=None, + execution_locale=None, + 
execution_configuration=None, + input_datasets=[], ): - """ - Wrapper to create dataset entry - - Parameters - ---------- - relpath : str - Relative path within the data registry to store the data - Relative to ///... - version : str - Semantic version string (i.e., M.N.P) or - "major", "minor", "patch" to automatically bump the version previous - owner_type : str - Either "production", "group", "user" - owner : str - Dataset owner - description : str - Description of dataset - name : str - A manually selected name for the dataset - execution_id : int - Execution entry related to this dataset - version_suffix : str - Append a suffix to the version string - is_dummy : bool - True for dummy dataset (copies no data) - old_location : str - Path to data to be copied to data registry - which_datareg : DataRegistry object - In case we want to register using a custom DataRegistry object - execution_name : str, optional - Typically pipeline name or program name - execution_description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - execution_locale : str, optional - Where was the execution performed? - execution_configuration : str, optional - Path to text file used to configure the execution - input_datasets : list, optional - List of dataset ids that were the input to this execution - - Returns - ------- - dataset_id : int - The dataset it created for this entry - """ - - # Some defaults over all test datasets - locale = "NERSC" - creation_data = None - make_sym_link = False - - # Add new entry. - dataset_id, execution_id = datareg.Registrar.dataset.create( - relpath, - version, - version_suffix=version_suffix, - name=name, - creation_date=creation_data, - description=description, - old_location=old_location, - copy=(not make_sym_link), - is_dummy=is_dummy, - execution_id=execution_id, - verbose=True, - owner=owner, - owner_type=owner_type, - is_overwritable=is_overwritable, - execution_name=execution_name, - execution_description=execution_description, - execution_start=execution_start, - execution_locale=execution_locale, - execution_configuration=execution_configuration, - input_datasets=input_datasets, - ) - - assert dataset_id is not None, "Trying to create a dataset that already exists" - assert execution_id is not None, "Trying to create a execution that already exists" - print(f"Created dataset entry with id {dataset_id}") - - return dataset_id + """ + Wrapper to create dataset entry + + Parameters + ---------- + relpath : str + Relative path within the data registry to store the data + Relative to ///... 
+ version : str + Semantic version string (i.e., M.N.P) or + "major", "minor", "patch" to automatically bump the version previous + owner_type : str + Either "production", "group", "user" + owner : str + Dataset owner + description : str + Description of dataset + name : str + A manually selected name for the dataset + execution_id : int + Execution entry related to this dataset + version_suffix : str + Append a suffix to the version string + is_dummy : bool + True for dummy dataset (copies no data) + old_location : str + Path to data to be copied to data registry + which_datareg : DataRegistry object + In case we want to register using a custom DataRegistry object + execution_name : str, optional + Typically pipeline name or program name + execution_description : str, optional + Human readible description of execution + execution_start : datetime, optional + Date the execution started + execution_locale : str, optional + Where was the execution performed? + execution_configuration : str, optional + Path to text file used to configure the execution + input_datasets : list, optional + List of dataset ids that were the input to this execution + + Returns + ------- + dataset_id : int + The dataset it created for this entry + """ + + # Some defaults over all test datasets + locale = "NERSC" + creation_data = None + make_sym_link = False + + # Add new entry. + dataset_id, execution_id = datareg.Registrar.dataset.create( + relpath, + version, + version_suffix=version_suffix, + name=name, + creation_date=creation_data, + description=description, + old_location=old_location, + copy=(not make_sym_link), + is_dummy=is_dummy, + execution_id=execution_id, + verbose=True, + owner=owner, + owner_type=owner_type, + is_overwritable=is_overwritable, + execution_name=execution_name, + execution_description=execution_description, + execution_start=execution_start, + execution_locale=execution_locale, + execution_configuration=execution_configuration, + input_datasets=input_datasets, + ) + + assert dataset_id is not None, "Trying to create a dataset that already exists" + assert execution_id is not None, "Trying to create a execution that already exists" + print(f"Created dataset entry with id {dataset_id}") + + return dataset_id def test_simple_query(dummy_file): - """Make a simple entry, and make sure the query returns the correct result""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_dataset", - "0.0.1", - "user", - None, - "This is my first DESC dataset", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.name", - "dataset.version_string", - "dataset.owner", - "dataset.owner_type", - "dataset.description", - "dataset.version_major", - "dataset.version_minor", - "dataset.version_patch", - "dataset.relative_path", - "dataset.version_suffix", - "dataset.data_org", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == "my_first_dataset" - assert getattr(r, "dataset.version_string") == "0.0.1" - assert getattr(r, "dataset.version_major") == 0 - assert getattr(r, "dataset.version_minor") == 0 - assert getattr(r, "dataset.version_patch") == 1 - assert getattr(r, "dataset.owner") == os.getenv("USER") - assert getattr(r, "dataset.owner_type") == "user" - assert 
getattr(r, "dataset.description") == "This is my first DESC dataset" - assert getattr(r, "dataset.relative_path") == "DESC/datasets/my_first_dataset" - assert getattr(r, "dataset.version_suffix") == None - assert getattr(r, "dataset.data_org") == "dummy" - assert i < 1 + """Make a simple entry, and make sure the query returns the correct result""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_dataset", + "0.0.1", + "user", + None, + "This is my first DESC dataset", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.name", + "dataset.version_string", + "dataset.owner", + "dataset.owner_type", + "dataset.description", + "dataset.version_major", + "dataset.version_minor", + "dataset.version_patch", + "dataset.relative_path", + "dataset.version_suffix", + "dataset.data_org", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == "my_first_dataset" + assert getattr(r, "dataset.version_string") == "0.0.1" + assert getattr(r, "dataset.version_major") == 0 + assert getattr(r, "dataset.version_minor") == 0 + assert getattr(r, "dataset.version_patch") == 1 + assert getattr(r, "dataset.owner") == os.getenv("USER") + assert getattr(r, "dataset.owner_type") == "user" + assert getattr(r, "dataset.description") == "This is my first DESC dataset" + assert getattr(r, "dataset.relative_path") == "DESC/datasets/my_first_dataset" + assert getattr(r, "dataset.version_suffix") == None + assert getattr(r, "dataset.data_org") == "dummy" + assert i < 1 def test_manual_name_and_vsuffix(dummy_file): - """Test setting the name and version suffix manually""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/my_second_dataset", - "0.0.1", - "user", - None, - "This is my first DESC dataset", - name="custom name", - version_suffix="custom_suffix", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.name", "dataset.version_suffix"], [f], return_format="cursorresult" - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == "custom name" - assert getattr(r, "dataset.version_suffix") == "custom_suffix" - assert i < 1 + """Test setting the name and version suffix manually""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/my_second_dataset", + "0.0.1", + "user", + None, + "This is my first DESC dataset", + name="custom name", + version_suffix="custom_suffix", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.name", "dataset.version_suffix"], [f], return_format="cursorresult" + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == "custom name" + assert getattr(r, "dataset.version_suffix") == "custom_suffix" + assert i < 1 @pytest.mark.parametrize( - "v_type,ans,name", - [ - ("major", "1.0.0", "my_first_dataset"), - 
("minor", "0.1.0", "my_first_dataset"), - ("patch", "0.0.2", "my_first_dataset"), - ("patch", "0.0.1", "my_second_dataset"), - ], + "v_type,ans,name", + [ + ("major", "1.0.0", "my_first_dataset"), + ("minor", "0.1.0", "my_first_dataset"), + ("patch", "0.0.2", "my_first_dataset"), + ("patch", "0.0.1", "my_second_dataset"), + ], ) def test_dataset_bumping(dummy_file, v_type, ans, name): - """ - Test bumping a dataset and make sure the new version is correct. - - Tests bumping datasets with and without a version suffix. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/bumped_dataset_{v_type}_{name}", - v_type, - "user", - None, - "This is my first bumped DESC dataset", - name=name, - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.name", "dataset.version_string"], [f], return_format="cursorresult" - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == name - assert getattr(r, "dataset.version_string") == ans - assert i < 1 + """ + Test bumping a dataset and make sure the new version is correct. + + Tests bumping datasets with and without a version suffix. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/bumped_dataset_{v_type}_{name}", + v_type, + "user", + None, + "This is my first bumped DESC dataset", + name=name, + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.name", "dataset.version_string"], [f], return_format="cursorresult" + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == name + assert getattr(r, "dataset.version_string") == ans + assert i < 1 @pytest.mark.parametrize("owner_type", ["user", "group", "project"]) def test_owner_types(dummy_file, owner_type): - """Test the different owner types""" + """Test the different owner types""" - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/owner_type_{owner_type}", - "0.0.1", - owner_type, - None, - f"This is a {owner_type} dataset", - ) + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/owner_type_{owner_type}", + "0.0.1", + owner_type, + None, + f"This is a {owner_type} dataset", + ) - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.owner_type"], [f], return_format="cursorresult" - ) + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.owner_type"], [f], return_format="cursorresult" + ) - for i, r in enumerate(results): - assert getattr(r, "dataset.owner_type") == owner_type - assert i < 1 + for i, r in enumerate(results): + assert getattr(r, "dataset.owner_type") == owner_type + assert i < 1 @pytest.mark.parametrize("data_org", ["file", 
"directory"]) def test_copy_data(dummy_file, data_org): - """Test copying real data into the registry (from an `old_location`)""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # File/directory we are copying in - if data_org == "file": - data_path = str(tmp_src_dir / "file1.txt") - else: - data_path = str(tmp_src_dir / "directory1") - - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/copy_real_{data_org}", - "0.0.1", - "user", - None, - "Test copying a real file", - old_location=data_path, - is_dummy=False, - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.data_org", "dataset.nfiles", "dataset.total_disk_space"], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.data_org") == data_org - assert getattr(r, "dataset.nfiles") == 1 - assert getattr(r, "dataset.total_disk_space") > 0 - assert i < 1 + """Test copying real data into the registry (from an `old_location`)""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # File/directory we are copying in + if data_org == "file": + data_path = str(tmp_src_dir / "file1.txt") + else: + data_path = str(tmp_src_dir / "directory1") + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/copy_real_{data_org}", + "0.0.1", + "user", + None, + "Test copying a real file", + old_location=data_path, + is_dummy=False, + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.data_org", "dataset.nfiles", "dataset.total_disk_space"], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.data_org") == data_org + assert getattr(r, "dataset.nfiles") == 1 + assert getattr(r, "dataset.total_disk_space") > 0 + assert i < 1 @pytest.mark.parametrize( - "data_org,data_path,v_str,overwritable", - [ - ("file", "file1.txt", "0.0.1", True), - ("file", "file1.txt", "0.0.2", False), - ("directory", "dummy_dir", "0.0.1", True), - ("directory", "dummy_dir", "0.0.2", False), - ], + "data_org,data_path,v_str,overwritable", + [ + ("file", "file1.txt", "0.0.1", True), + ("file", "file1.txt", "0.0.2", False), + ("directory", "dummy_dir", "0.0.1", True), + ("directory", "dummy_dir", "0.0.2", False), + ], ) def test_on_location_data(dummy_file, data_org, data_path, v_str, overwritable): - """ - Test ingesting real data into the registry (already on location). Also - tests overwriting datasets. - - Does twice for each file, the first is a normal entry with - `is_overwritable=True`. The second tests overwriting the previous data with - a new version. 
- """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - d_id = _insert_dataset_entry( - datareg, - data_path, - v_str, - "user", - None, - "Test ingesting a real file on location", - old_location=None, - is_dummy=False, - is_overwritable=overwritable, - ) - - f = datareg.Query.gen_filter("dataset.relative_path", "==", data_path) - results = datareg.Query.find_datasets( - [ - "dataset.data_org", - "dataset.nfiles", - "dataset.total_disk_space", - "dataset.is_overwritable", - "dataset.is_overwritten", - "dataset.version_string", - ], - [f], - return_format="cursorresult", - ) - - num_results = len(results.all()) - for i, r in enumerate(results): - assert getattr(r, "dataset.data_org") == data_org - assert getattr(r, "dataset.nfiles") == 1 - assert getattr(r, "dataset.total_disk_space") > 0 - if getattr(r, "version_string") == "0.0.1": - if num_results == 1: - assert getattr(r, "dataset.is_overwritable") == True - assert getattr(r, "dataset.is_overwritten") == False - else: - assert getattr(r, "dataset.is_overwritable") == True - assert getattr(r, "dataset.is_overwritten") == True - else: - if num_results == 1: - assert getattr(r, "dataset.is_overwritable") == False - assert getattr(r, "dataset.is_overwritten") == True - else: - assert getattr(r, "dataset.is_overwritable") == False - assert getattr(r, "dataset.is_overwritten") == False - assert i < 2 + """ + Test ingesting real data into the registry (already on location). Also + tests overwriting datasets. + + Does twice for each file, the first is a normal entry with + `is_overwritable=True`. The second tests overwriting the previous data with + a new version. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + d_id = _insert_dataset_entry( + datareg, + data_path, + v_str, + "user", + None, + "Test ingesting a real file on location", + old_location=None, + is_dummy=False, + is_overwritable=overwritable, + ) + + f = datareg.Query.gen_filter("dataset.relative_path", "==", data_path) + results = datareg.Query.find_datasets( + [ + "dataset.data_org", + "dataset.nfiles", + "dataset.total_disk_space", + "dataset.is_overwritable", + "dataset.is_overwritten", + "dataset.version_string", + ], + [f], + return_format="cursorresult", + ) + + num_results = len(results.all()) + for i, r in enumerate(results): + assert getattr(r, "dataset.data_org") == data_org + assert getattr(r, "dataset.nfiles") == 1 + assert getattr(r, "dataset.total_disk_space") > 0 + if getattr(r, "version_string") == "0.0.1": + if num_results == 1: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == False + else: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == True + else: + if num_results == 1: + assert getattr(r, "dataset.is_overwritable") == False + assert getattr(r, "dataset.is_overwritten") == True + else: + assert getattr(r, "dataset.is_overwritable") == False + assert getattr(r, "dataset.is_overwritten") == False + assert i < 2 def test_dataset_alias(dummy_file): - """Register a dataset and make a dataset alias entry for it""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add dataset - d_id = _insert_dataset_entry( - datareg, - 
"alias_test_entry", - "0.0.1", - "user", - None, - "Test dataset alias", - ) - - # Add alias - _insert_alias_entry(datareg, "nice_dataset_name", d_id) - - # Query - f = datareg.Query.gen_filter("dataset_alias.alias", "==", "nice_dataset_name") - results = datareg.Query.find_datasets( - [ - "dataset.dataset_id", - "dataset_alias.dataset_id", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.dataset_id") == d_id - assert getattr(r, "dataset_alias.dataset_id") == d_id - assert i < 1 + """Register a dataset and make a dataset alias entry for it""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add dataset + d_id = _insert_dataset_entry( + datareg, + "alias_test_entry", + "0.0.1", + "user", + None, + "Test dataset alias", + ) + + # Add alias + _insert_alias_entry(datareg, "nice_dataset_name", d_id) + + # Query + f = datareg.Query.gen_filter("dataset_alias.alias", "==", "nice_dataset_name") + results = datareg.Query.find_datasets( + [ + "dataset.dataset_id", + "dataset_alias.dataset_id", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.dataset_id") == d_id + assert getattr(r, "dataset_alias.dataset_id") == d_id + assert i < 1 def test_pipeline_entry(dummy_file): - """ - Test making multiple executions and datasets to form a pipeline. - - Also queries to make sure dependencies are made. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entries - ex_id_1 = _insert_execution_entry( - datareg, "pipeline_stage_1", "The first stage of my pipeline" - ) - - d_id_1 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage1", - "0.0.1", - "user", - None, - "This is data for stage 1 of my first pipeline", - execution_id=ex_id_1, - ) - - ex_id_2 = _insert_execution_entry( - datareg, - "pipeline_stage_2", - "The second stage of my pipeline", - input_datasets=[d_id_1], - ) - - d_id_2 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage2a", - "0.0.1", - "user", - None, - "This is data for stage 2 of my first pipeline", - execution_id=ex_id_2, - ) - - d_id_3 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage2b", - "0.0.1", - "user", - None, - "This is data for stage 2 of my first pipeline", - execution_id=ex_id_2, - ) - - # Stage 3 of my pipeline - ex_id_3 = _insert_execution_entry( - datareg, - "pipeline_stage_3", - "The third stage of my pipeline", - input_datasets=[d_id_2, d_id_3], - ) - - # Query on execution - f = datareg.Query.gen_filter("dataset.execution_id", "==", ex_id_2) - results = datareg.Query.find_datasets( - [ - "dataset.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert "my_first_pipeline_stage2" in getattr(r, "dataset.name") - assert i < 2 - - # Query on dependency - f = datareg.Query.gen_filter("dependency.execution_id", "==", ex_id_2) - results = datareg.Query.find_datasets( - [ - "dependency.execution_id", - "dataset.dataset_id", - "dataset.execution_id", - "dataset.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.dataset_id") == d_id_1 - assert i < 1 + """ + Test making multiple executions and datasets to form a pipeline. 
+ + Also queries to make sure dependencies are made. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entries + ex_id_1 = _insert_execution_entry( + datareg, "pipeline_stage_1", "The first stage of my pipeline" + ) + + d_id_1 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage1", + "0.0.1", + "user", + None, + "This is data for stage 1 of my first pipeline", + execution_id=ex_id_1, + ) + + ex_id_2 = _insert_execution_entry( + datareg, + "pipeline_stage_2", + "The second stage of my pipeline", + input_datasets=[d_id_1], + ) + + d_id_2 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage2a", + "0.0.1", + "user", + None, + "This is data for stage 2 of my first pipeline", + execution_id=ex_id_2, + ) + + d_id_3 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage2b", + "0.0.1", + "user", + None, + "This is data for stage 2 of my first pipeline", + execution_id=ex_id_2, + ) + + # Stage 3 of my pipeline + ex_id_3 = _insert_execution_entry( + datareg, + "pipeline_stage_3", + "The third stage of my pipeline", + input_datasets=[d_id_2, d_id_3], + ) + + # Query on execution + f = datareg.Query.gen_filter("dataset.execution_id", "==", ex_id_2) + results = datareg.Query.find_datasets( + [ + "dataset.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert "my_first_pipeline_stage2" in getattr(r, "dataset.name") + assert i < 2 + + # Query on dependency + f = datareg.Query.gen_filter("dependency.execution_id", "==", ex_id_2) + results = datareg.Query.find_datasets( + [ + "dependency.execution_id", + "dataset.dataset_id", + "dataset.execution_id", + "dataset.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.dataset_id") == d_id_1 + assert i < 1 def test_global_owner_set(dummy_file): - """ - Test setting the owner and owner_type globally during the database - initialization. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry( - root_dir=str(tmp_root_dir), - schema=SCHEMA_VERSION, - owner="DESC group", - owner_type="group", - ) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/global_user_dataset", - "0.0.1", - None, - None, - "Should be allocated user and user_type from global config", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.owner", - "dataset.owner_type", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.owner") == "DESC group" - assert getattr(r, "dataset.owner_type") == "group" - assert i < 1 + """ + Test setting the owner and owner_type globally during the database + initialization. 
+ """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry( + root_dir=str(tmp_root_dir), + schema=SCHEMA_VERSION, + owner="DESC group", + owner_type="group", + ) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/global_user_dataset", + "0.0.1", + None, + None, + "Should be allocated user and user_type from global config", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.owner", + "dataset.owner_type", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.owner") == "DESC group" + assert getattr(r, "dataset.owner_type") == "group" + assert i < 1 @pytest.mark.skip(reason="Can't do production related things with sqlite") def test_prooduction_schema(dummy_file): - """ - Test making multiple executions and datasets to form a pipeline. - - Also queries to make sure dependencies are made. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") - - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/production_dataset_1", - "0.0.1", - "production", - None, - "This is production's first dataset", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.owner", - "dataset.owner_type", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.owner") == "production" - assert getattr(r, "dataset.owner_type") == "production" - assert i < 1 + """ + Test making multiple executions and datasets to form a pipeline. + + Also queries to make sure dependencies are made. 
+ """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/production_dataset_1", + "0.0.1", + "production", + None, + "This is production's first dataset", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.owner", + "dataset.owner_type", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.owner") == "production" + assert getattr(r, "dataset.owner_type") == "production" + assert i < 1 def test_execution_config_file(dummy_file): - """Test ingesting a configuration file with an execution entry""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - ex_id = _insert_execution_entry( - datareg, - "execution_with_configuration", - "An execution with an input configuration file", - configuration=str(tmp_src_dir / "dummy_configuration_file.yaml"), - ) - - # Query - f = datareg.Query.gen_filter("execution.execution_id", "==", ex_id) - results = datareg.Query.find_datasets( - [ - "execution.configuration", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "execution.configuration") is not None - assert i < 1 + """Test ingesting a configuration file with an execution entry""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + ex_id = _insert_execution_entry( + datareg, + "execution_with_configuration", + "An execution with an input configuration file", + configuration=str(tmp_src_dir / "dummy_configuration_file.yaml"), + ) + + # Query + f = datareg.Query.gen_filter("execution.execution_id", "==", ex_id) + results = datareg.Query.find_datasets( + [ + "execution.configuration", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "execution.configuration") is not None + assert i < 1 def test_dataset_with_execution(dummy_file): - """ - Test modifying the datasets default execution directly when registering the - dataset - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - d_id_1 = _insert_dataset_entry( - datareg, - "DESC/datasets/execution_test_input", - "0.0.1", - None, - None, - "This is production's first dataset", - ) - - d_id_2 = _insert_dataset_entry( - datareg, - "DESC/datasets/execution_test", - "0.0.1", - None, - None, - "This should have a more descriptive execution", - execution_name="Overwrite execution auto name", - execution_description="Overwrite execution auto description", - execution_locale="TestMachine", - input_datasets=[d_id_1], - ) - - # Query on execution - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id_2) - results = datareg.Query.find_datasets( - [ - "dataset.name", - "execution.execution_id", - "execution.description", - "execution.locale", - "execution.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "execution.name") == "Overwrite execution auto name" - assert ( - getattr(r, "execution.description") - == "Overwrite execution auto 
description" - ) - assert getattr(r, "execution.locale") == "TestMachine" - ex_id_1 = getattr(r, "execution.execution_id") - assert i < 1 - - # Query on dependency - f = datareg.Query.gen_filter("dependency.input_id", "==", d_id_1) - results = datareg.Query.find_datasets( - [ - "dataset.dataset_id", - "dependency.execution_id", - "dependency.input_id", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dependency.execution_id") == ex_id_1 - assert i < 1 + """ + Test modifying the datasets default execution directly when registering the + dataset + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + d_id_1 = _insert_dataset_entry( + datareg, + "DESC/datasets/execution_test_input", + "0.0.1", + None, + None, + "This is production's first dataset", + ) + + d_id_2 = _insert_dataset_entry( + datareg, + "DESC/datasets/execution_test", + "0.0.1", + None, + None, + "This should have a more descriptive execution", + execution_name="Overwrite execution auto name", + execution_description="Overwrite execution auto description", + execution_locale="TestMachine", + input_datasets=[d_id_1], + ) + + # Query on execution + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id_2) + results = datareg.Query.find_datasets( + [ + "dataset.name", + "execution.execution_id", + "execution.description", + "execution.locale", + "execution.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "execution.name") == "Overwrite execution auto name" + assert ( + getattr(r, "execution.description") + == "Overwrite execution auto description" + ) + assert getattr(r, "execution.locale") == "TestMachine" + ex_id_1 = getattr(r, "execution.execution_id") + assert i < 1 + + # Query on dependency + f = datareg.Query.gen_filter("dependency.input_id", "==", d_id_1) + results = datareg.Query.find_datasets( + [ + "dataset.dataset_id", + "dependency.execution_id", + "dependency.input_id", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dependency.execution_id") == ex_id_1 + assert i < 1 def test_get_dataset_absolute_path(dummy_file): - """ - Test the generation of the full absolute path of a dataset using the - `Query.get_dataset_absolute_path` function - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - dset_relpath = "DESC/datasets/get_dataset_absolute_path_test" - dset_ownertype = "group" - dset_owner = "group1" - - # Make a basic entry - d_id_1 = _insert_dataset_entry( - datareg, - dset_relpath, - "0.0.1", - dset_ownertype, - dset_owner, - "Test the Query.get_dataset_absolute_path function", - ) - - v = datareg.Query.get_dataset_absolute_path(d_id_1) - - if datareg.Query._dialect == "sqlite": - assert v == os.path.join( - str(tmp_root_dir), dset_ownertype, dset_owner, dset_relpath - ) - else: - assert v == os.path.join( - str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath - ) - - -def test_delete_entry_dummy(dummy_file): - """Make a simple (dummy) entry, then delete it, then check it was deleted""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Make sure we raise an exception trying to delete a 
dataset that doesn't exist - with pytest.raises(ValueError, match="does not exist"): - datareg.Registrar.dataset.delete(10000) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/dummy_dataset_to_delete", - "0.0.1", - "user", - None, - "A dataset to delete", - ) - - # Now delete that entry - datareg.Registrar.dataset.delete(d_id) - - # Check the entry was deleted - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.status", - "dataset.delete_date", - "dataset.delete_uid", - ], - [f], - return_format="cursorresult", - ) - - for r in results: - assert getattr(r, "dataset.status") == 3 - assert getattr(r, "dataset.delete_date") is not None - assert getattr(r, "dataset.delete_uid") is not None - - -def test_delete_entry_real(dummy_file): - """Make a simple (real data) entry, then delete it, then check it was deleted""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Make sure we raise an exception trying to delete a dataset that doesn't exist - with pytest.raises(ValueError, match="does not exist"): - datareg.Registrar.dataset.delete(10000) - - # Add entry - data_path = str(tmp_src_dir / "file2.txt") - assert os.path.isfile(data_path) - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/real_dataset_to_delete", - "0.0.1", - "user", - None, - "A dataset to delete", - old_location=data_path, - is_dummy=False, - ) - - # Now delete that entry - datareg.Registrar.dataset.delete(d_id) - - # Check the entry was set to deleted in the registry - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.status", - "dataset.delete_date", - "dataset.delete_uid", - "dataset.owner", - "dataset.owner_type", - "dataset.relative_path", - ], - [f], - return_format="cursorresult", - ) - - for r in results: - assert getattr(r, "dataset.status") == 3 - assert getattr(r, "dataset.delete_date") is not None - assert getattr(r, "dataset.delete_uid") is not None - - # Make sure the file in the root_dir has gone - data_path = _form_dataset_path( - getattr(r, "dataset.owner_type"), - getattr(r, "dataset.owner"), - getattr(r, "dataset.relative_path"), - schema=SCHEMA_VERSION, - root_dir=str(tmp_root_dir), - ) - assert not os.path.isfile(data_path) + """ + Test the generation of the full absolute path of a dataset using the + `Query.get_dataset_absolute_path` function + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + dset_relpath = "DESC/datasets/get_dataset_absolute_path_test" + dset_ownertype = "group" + dset_owner = "group1" + + # Make a basic entry + d_id_1 = _insert_dataset_entry( + datareg, + dset_relpath, + "0.0.1", + dset_ownertype, + dset_owner, + "Test the Query.get_dataset_absolute_path function", + ) + + v = datareg.Query.get_dataset_absolute_path(d_id_1) + + if datareg.Query._dialect == "sqlite": + assert v == os.path.join( + str(tmp_root_dir), dset_ownertype, dset_owner, dset_relpath + ) + else: + assert v == os.path.join( + str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath + ) + +@pytest.mark.parametrize( + "is_dummy,dataset_name", + [ + (True, "dummy_dataset_to_delete"), + (False, "real_dataset_to_delete"), + ], +) +def test_delete_entry(dummy_file, is_dummy, dataset_name): + """ + Make a simple 
entry, then delete it, then check it was deleted. + + Does this for a dummy dataset and a real one. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Make sure we raise an exception trying to delete a dataset that doesn't exist + with pytest.raises(ValueError, match="does not exist"): + datareg.Registrar.dataset.delete(10000) + + # Where is the real data? + if is_dummy: + data_path = None + else: + data_path = str(tmp_src_dir / "file2.txt") + assert os.path.isfile(data_path) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/{dataset_name}", + "0.0.1", + "user", + None, + "A dataset to delete", + is_dummy=is_dummy, + old_location=data_path, + ) + + # Now delete that entry + datareg.Registrar.dataset.delete(d_id) + + # Check the entry was deleted + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.status", + "dataset.delete_date", + "dataset.delete_uid", + "dataset.owner_type", + "dataset.owner", + "dataset.relative_path", + ], + [f], + return_format="cursorresult", + ) + + for r in results: + assert getattr(r, "dataset.status") == 3 + assert getattr(r, "dataset.delete_date") is not None + assert getattr(r, "dataset.delete_uid") is not None + + if not is_dummy: + # Make sure the file in the root_dir has gone + data_path = _form_dataset_path( + getattr(r, "dataset.owner_type"), + getattr(r, "dataset.owner"), + getattr(r, "dataset.relative_path"), + schema=SCHEMA_VERSION, + root_dir=str(tmp_root_dir), + ) + assert not os.path.isfile(data_path) + + # Make sure we can not delete an already deleted entry. + with pytest.raises(ValueError, match="not have a valid status"): + datareg.Registrar.dataset.delete(d_id) From 408b11e0371d779e849130d88692196662cb1125 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 17 Jan 2024 16:20:44 +0100 Subject: [PATCH 07/19] Update dataset status flag to be a bitmask --- src/dataregistry/registrar/dataset.py | 8 ++- src/dataregistry/registrar/dataset_util.py | 74 ++++++++++++++++++++++ src/dataregistry/schema/schema.yaml | 2 +- tests/unit_tests/test_dataset_status.py | 51 +++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 src/dataregistry/registrar/dataset_util.py create mode 100644 tests/unit_tests/test_dataset_status.py diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 23e2ecbb..0d0215ae 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -14,6 +14,7 @@ _read_configuration_file, get_directory_info, ) +from .dataset_util import set_dataset_status, get_dataset_status # Default maximum allowed length of configuration file allowed to be ingested _DEFAULT_MAX_CONFIG = 10000 @@ -68,7 +69,8 @@ def create( First, the dataset entry is created in the database. If success, the data is then copied (if `old_location` was provided). Only if both - steps are successful will there be `status=1` entry in the registry. + steps are successful will there be "valid" status entry in the + registry. Parameters ---------- @@ -225,7 +227,7 @@ def create( # We tentatively start with an "invalid" dataset in the database. This # will be upgraded to valid if the data copying (if any) was successful. - values["status"] = -1 + values["status"] = 0 # Create a new row in the data registry database. 
with self.parent._engine.connect() as conn: @@ -271,7 +273,7 @@ def create( nfiles=num_files, total_disk_space=total_size / 1024 / 1024, creation_date=ds_creation_date, - status=1, + status=set_dataset_status(values["status"], valid=True), ) ) conn.execute(update_stmt) diff --git a/src/dataregistry/registrar/dataset_util.py b/src/dataregistry/registrar/dataset_util.py new file mode 100644 index 00000000..e9f8887b --- /dev/null +++ b/src/dataregistry/registrar/dataset_util.py @@ -0,0 +1,74 @@ +# Define constants for dataset's "status" bit position +VALID_STATUS_BITS = { + # Is a valid dataset or not. "Invalid" means the dataset entry was created in + # the database, but there was an issue copying the physical data. + "valid": 0, + # Has the data of this dataset been deleted from the `root_dir`? + "deleted": 1, + # Has the data for this dataset been archived? + "archived": 2, +} + + +def set_dataset_status(current_valid_flag, valid=None, deleted=None, archived=None): + """ + Update a value of a dataset's status bit poistion. + + These properties are not mutually exclusive, e.g., a dataset can be both + archived and deleted. + + Properties + ---------- + current_valid_flag : int + The current bitwise representation of the dataset's status + valid : bool, optional + True to set the dataset as valid, False for invalid + deleted : bool, optional + True to set the dataset as deleted + archived : bool, optional + True to set the dataset as archived + + Returns + ------- + valid_flag : int + The datasets new bitwise representation + """ + + if valid is not None: + current_valid_flag &= ~(1 << VALID_STATUS_BITS["valid"]) + current_valid_flag |= valid << VALID_STATUS_BITS["valid"] + + if deleted is not None: + current_valid_flag &= ~(1 << VALID_STATUS_BITS["deleted"]) + current_valid_flag |= deleted << VALID_STATUS_BITS["deleted"] + + if archived is not None: + current_valid_flag &= ~(1 << VALID_STATUS_BITS["archived"]) + current_valid_flag |= archived << VALID_STATUS_BITS["archived"] + + return current_valid_flag + + +def get_dataset_status(current_valid_flag, which_bit): + """ + Return the status of a dataset for a given bit index. + + Properties + ---------- + current_flag_value : int + The current bitwise representation of the dataset's status + which_bit : str + One of VALID_STATUS_BITS keys() + + Returns + ------- + - : bool + True if `which_bit` is 1. e.g., If a dataset is deleted + `get_dataset_status(, "deleted") will return True. + """ + + # Make sure `which_bit` is valid. + if which_bit not in VALID_STATUS_BITS.keys(): + raise ValueError(f"{which_bit} is not a valid dataset status") + + return (current_valid_flag & (1 << VALID_STATUS_BITS[which_bit])) != 0 diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index b8cb646e..30035fd6 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -272,7 +272,7 @@ dataset: status: type: "Integer" nullable: False - description: "What is the status of the dataset? -1: Invalid (e.g., copy data failed during creation), 1: Valid, 2: Archived, 3: deleted" + description: "What is the status of the dataset? This is a bitmask description of multiple states. Bit number 0=valid (1 if dataset is valid, 0 if copy data failed during creation), 1=deleted (1 if dataset is deleted and no longer on disk, 0 if data is still on disk, database entry is always kept) 3=archived (1 if data has been archived). For example '0b011` would be valid=1, deleted=1 and archived=0." 
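Note: a minimal sketch of how this status bitmask is intended to be read and written, using the set_dataset_status / get_dataset_status helpers added in dataset_util.py above (bit 0 = valid, bit 1 = deleted, bit 2 = archived, following VALID_STATUS_BITS). It assumes the patched dataregistry package is importable:

    from dataregistry.registrar.dataset_util import (
        get_dataset_status,
        set_dataset_status,
    )

    status = 0                                         # fresh entry, no bits set
    status = set_dataset_status(status, valid=True)    # data copy succeeded -> 0b001
    status = set_dataset_status(status, deleted=True)  # data later removed  -> 0b011

    assert get_dataset_status(status, "valid")
    assert get_dataset_status(status, "deleted")
    assert not get_dataset_status(status, "archived")
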
archive_date: type: "DateTime" description: "Dataset archive date" diff --git a/tests/unit_tests/test_dataset_status.py b/tests/unit_tests/test_dataset_status.py new file mode 100644 index 00000000..2342cf7a --- /dev/null +++ b/tests/unit_tests/test_dataset_status.py @@ -0,0 +1,51 @@ +from dataregistry.registrar.dataset_util import set_dataset_status, get_dataset_status +import pytest + + +@pytest.mark.parametrize( + "start_status,valid,deleted,archived,end_status", + [ + (0, True, False, False, "0b1"), + (0, True, True, True, "0b111"), + (0, True, False, True, "0b101"), + (5, None, True, None, "0b111"), + ], +) +def test_set_dataset_status(start_status, valid, deleted, archived, end_status): + """ + Make sure dataset bitwise valid flags get set correctly + + Starts from a value and adds a flag, e.g., `deleted`, then + checks the combined bitmask is correct. + """ + + assert ( + bin( + set_dataset_status( + start_status, valid=valid, deleted=deleted, archived=archived + ) + ) + == end_status + ) + + +@pytest.mark.parametrize( + "bin_status,is_valid,is_deleted,is_archived", + [ + ("0b1", True, False, False), + ("0b111", True, True, True), + ("0b101", True, False, True), + ("0b011", True, True, False), + ], +) +def test_get_dataset_status(bin_status, is_valid, is_deleted, is_archived): + """ + Make sure dataset bitwise valid flags get checked correctly. + + For a given `bin_status` (binary status), check that it pulls out the + individual flags correctly. + """ + + assert get_dataset_status(int(bin_status, 2), "valid") == is_valid + assert get_dataset_status(int(bin_status, 2), "deleted") == is_deleted + assert get_dataset_status(int(bin_status, 2), "archived") == is_archived From 2ab40b124fa0f470bf86c50257bb168866c706a8 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 26 Jan 2024 13:55:03 +0100 Subject: [PATCH 08/19] Create a base table registrar class that the dataset/execution/etc classes inherit. This means common functions between the tables, e.g., delete, can be inherited from the base class --- src/dataregistry/registrar/__init__.py | 2 +- .../registrar/base_table_class.py | 77 ++++++++++++++++++ src/dataregistry/registrar/dataset.py | 78 +++++++------------ src/dataregistry/registrar/dataset_alias.py | 24 ++---- src/dataregistry/registrar/execution.py | 49 ++++-------- src/dataregistry/registrar/registrar.py | 50 ++++-------- tests/end_to_end_tests/test_end_to_end.py | 2 +- 7 files changed, 146 insertions(+), 136 deletions(-) create mode 100644 src/dataregistry/registrar/base_table_class.py diff --git a/src/dataregistry/registrar/__init__.py b/src/dataregistry/registrar/__init__.py index 34cafb33..f0d44449 100644 --- a/src/dataregistry/registrar/__init__.py +++ b/src/dataregistry/registrar/__init__.py @@ -1 +1 @@ -from .registrar import _OWNER_TYPES, Registrar +from .registrar import Registrar diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py new file mode 100644 index 00000000..54a93f60 --- /dev/null +++ b/src/dataregistry/registrar/base_table_class.py @@ -0,0 +1,77 @@ +import os + +from dataregistry.db_basic import TableMetadata + +# Allowed owner types +_OWNER_TYPES = {"user", "project", "group", "production"} + +# Default maximum allowed length of configuration file allowed to be ingested +_DEFAULT_MAX_CONFIG = 10000 + + +class BaseTable: + def __init__(self, db_connection, root_dir, owner, owner_type): + """ + Base class to register/modify/delete entries in the database tables. 
+ + Each table subclass (e.g., DatasetTable) will inherit this class. + Functions universal to all tables, such as delete and modify are + written here, the register function and other unique functions for the + tables are in their own subclasses. + + Parameters + ---------- + db_connection : DbConnection object + Encompasses sqlalchemy engine, dialect (database backend) + and schema version + root_dir : str + Root directory of the dataregistry on disk + owner : str + To set the default owner for all registered datasets in this + instance. + owner_type : str + To set the default owner_type for all registered datasets in this + instance. + """ + + # Root directory on disk for data registry files + self._root_dir = root_dir + + # Database engine and dialect. + self._engine = db_connection.engine + self._schema = db_connection.schema + + # Link to Table Metadata. + self._metadata_getter = TableMetadata(db_connection) + + # Store user id + self._uid = os.getenv("USER") + + # Default owner and owner_type's + self._owner = owner + self._owner_type = owner_type + + # Allowed owner types + self._OWNER_TYPES = _OWNER_TYPES + + # Max configuration file length allowed + self._DEFAULT_MAX_CONFIG = _DEFAULT_MAX_CONFIG + + def _get_table_metadata(self, tbl): + return self._metadata_getter.get(tbl) + + def delete(self): + """ + Delete entry from the DESC data registry. + + """ + + raise NotImplementedError + + def modify(self): + """ + Modify a entry in the DESC data registry. + + """ + + raise NotImplementedError diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 52cbf025..157251da 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -5,6 +5,7 @@ from dataregistry.db_basic import add_table_row from sqlalchemy import select, update +from .base_table_class import BaseTable from .registrar_util import ( _bump_version, _copy_data, @@ -15,22 +16,13 @@ get_directory_info, ) -# Default maximum allowed length of configuration file allowed to be ingested -_DEFAULT_MAX_CONFIG = 10000 +class DatasetTable(BaseTable): + def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): + super().__init__(db_connection, root_dir, owner, owner_type) -class RegistrarDataset: - def __init__(self, parent): - """ - Wrapper class to register/modify/delete dataset entries. - - Parameters - ---------- - parent : Registrar class - Contains db_connection, engine, etc - """ - - self.parent = parent + self.execution_table = execution_table + self.which_table = "dataset" def create( self, @@ -57,7 +49,7 @@ def create( execution_configuration=None, input_datasets=[], input_production_datasets=[], - max_config_length=_DEFAULT_MAX_CONFIG, + max_config_length=None, ): """ Create a new dataset entry in the DESC data registry. 
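Note on the max_config_length change above: the signature default moves from the module-level constant to None, and None is resolved against the class-level _DEFAULT_MAX_CONFIG inside the method (next hunk), so the limit is looked up at call time rather than frozen in at import. A small self-contained sketch of that pattern (hypothetical class name, illustrative value):

    class ExampleTable:
        _DEFAULT_MAX_CONFIG = 10000  # illustrative per-class limit

        def create(self, max_config_length=None):
            # None means "use the class default", resolved at call time
            if max_config_length is None:
                max_config_length = self._DEFAULT_MAX_CONFIG
            return max_config_length

    assert ExampleTable().create() == 10000
    assert ExampleTable().create(max_config_length=50) == 50
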
@@ -117,21 +109,25 @@ def create( The execution ID associated with the dataset """ + # Set max configuration file length + if max_config_length is None: + max_config_length = self._DEFAULT_MAX_CONFIG + # Make sure the owner_type is legal if owner_type is None: - if self.parent._owner_type is not None: - owner_type = self.parent._owner_type + if self._owner_type is not None: + owner_type = self._owner_type else: owner_type = "user" - if owner_type not in self.parent.get_owner_types(): + if owner_type not in self._OWNER_TYPES: raise ValueError(f"{owner_type} is not a valid owner_type") # Establish the dataset owner if owner is None: - if self.parent._owner is not None: - owner = self.parent._owner + if self._owner is not None: + owner = self._owner else: - owner = self.parent._uid + owner = self._uid if owner_type == "production": owner = "production" @@ -141,12 +137,12 @@ def create( raise ValueError("Cannot overwrite production entries") if version_suffix is not None: raise ValueError("Production entries can't have version suffix") - if self.parent._schema != "production": + if self._schema != "production": raise ValueError( "Only the production schema can handle owner_type='production'" ) else: - if self.parent._schema == "production": + if self._schema == "production": raise ValueError( "Only the production schema can handle owner_type='production'" ) @@ -156,7 +152,7 @@ def create( name = _name_from_relpath(relative_path) # Look for previous entries. Fail if not overwritable - dataset_table = self.parent._get_table_metadata("dataset") + dataset_table = self._get_table_metadata("dataset") previous = self._find_previous(relative_path, dataset_table, owner, owner_type) if previous is None: @@ -171,7 +167,7 @@ def create( # Generate new version fields based on previous entries # with the same name field and same suffix (i.e., bump) v_fields = _bump_version( - name, version, version_suffix, dataset_table, self.parent._engine + name, version, version_suffix, dataset_table, self._engine ) version_string = ( f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" @@ -185,7 +181,7 @@ def create( execution_name = f"{execution_name}-{version_suffix}" if execution_description is None: execution_description = "Fabricated execution for dataset" - execution_id = self.parent.execution.create( + execution_id = self.execution_table.create( execution_name, description=execution_description, execution_start=execution_start, @@ -221,15 +217,15 @@ def create( values["register_date"] = datetime.now() values["owner_type"] = owner_type values["owner"] = owner - values["creator_uid"] = self.parent._uid - values["register_root_dir"] = self.parent._root_dir + values["creator_uid"] = self._uid + values["register_root_dir"] = self._root_dir # We tentatively start with an "invalid" dataset in the database. This # will be upgraded to True if the data copying (if any) was successful. values["is_valid"] = False # Create a new row in the data registry database. 
- with self.parent._engine.connect() as conn: + with self._engine.connect() as conn: prim_key = add_table_row(conn, dataset_table, values, commit=False) if len(previous) > 0: @@ -263,7 +259,7 @@ def create( ds_creation_date = creation_date # Copy was successful, update the entry with dataset metadata - with self.parent._engine.connect() as conn: + with self._engine.connect() as conn: update_stmt = ( update(dataset_table) .where(dataset_table.c.dataset_id == prim_key) @@ -319,8 +315,8 @@ def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): owner_type, owner, relative_path, - schema=self.parent._schema, - root_dir=self.parent._root_dir, + schema=self._schema, + root_dir=self._root_dir, ) # Is the data already on location, or coming from somewhere new? @@ -397,7 +393,7 @@ def _find_previous(self, relative_path, dataset_table, owner, owner_type): .order_by(dataset_table.c.dataset_id.desc()) ) - with self.parent._engine.connect() as conn: + with self._engine.connect() as conn: result = conn.execute(stmt) conn.commit() @@ -410,19 +406,3 @@ def _find_previous(self, relative_path, dataset_table, owner, owner_type): previous.append(r.dataset_id) return previous - - def delete(self): - """ - Delete a dataset entry from the DESC data registry. - - """ - - raise NotImplementedError - - def modify(self): - """ - Modify a dataset entry in the DESC data registry. - - """ - - raise NotImplementedError diff --git a/src/dataregistry/registrar/dataset_alias.py b/src/dataregistry/registrar/dataset_alias.py index baaf5054..ebd6dea2 100644 --- a/src/dataregistry/registrar/dataset_alias.py +++ b/src/dataregistry/registrar/dataset_alias.py @@ -3,22 +3,14 @@ from dataregistry.db_basic import add_table_row from sqlalchemy import update -# Default maximum allowed length of configuration file allowed to be ingested -_DEFAULT_MAX_CONFIG = 10000 +from .base_table_class import BaseTable -class RegistrarDatasetAlias: - def __init__(self, parent): - """ - Wrapper class to register/modify/delete execution entries. 
- - Parameters - ---------- - parent : Registrar class - Contains db_connection, engine, etc - """ +class DatasetAliasTable(BaseTable): + def __init__(self, db_connection, root_dir, owner, owner_type): + super().__init__(db_connection, root_dir, owner, owner_type) - self.parent = parent + self.which_table = "dataset_alias" def create(self, aliasname, dataset_id): """ @@ -43,10 +35,10 @@ def create(self, aliasname, dataset_id): values = {"alias": aliasname} values["dataset_id"] = dataset_id values["register_date"] = now - values["creator_uid"] = self.parent._uid + values["creator_uid"] = self._uid - alias_table = self.parent._get_table_metadata("dataset_alias") - with self.parent._engine.connect() as conn: + alias_table = self._get_table_metadata("dataset_alias") + with self._engine.connect() as conn: prim_key = add_table_row(conn, alias_table, values) # Update any other alias rows which have been superseded diff --git a/src/dataregistry/registrar/execution.py b/src/dataregistry/registrar/execution.py index fed95823..0bf4f4d9 100644 --- a/src/dataregistry/registrar/execution.py +++ b/src/dataregistry/registrar/execution.py @@ -2,24 +2,15 @@ from dataregistry.db_basic import add_table_row +from .base_table_class import BaseTable from .registrar_util import _read_configuration_file -# Default maximum allowed length of configuration file allowed to be ingested -_DEFAULT_MAX_CONFIG = 10000 +class ExecutionTable(BaseTable): + def __init__(self, db_connection, root_dir, owner, owner_type): + super().__init__(db_connection, root_dir, owner, owner_type) -class RegistrarExecution: - def __init__(self, parent): - """ - Wrapper class to register/modify/delete execution entries. - - Parameters - ---------- - parent : Registrar class - Contains db_connection, engine, etc - """ - - self.parent = parent + self.which_table = "execution" def create( self, @@ -30,7 +21,7 @@ def create( configuration=None, input_datasets=[], input_production_datasets=[], - max_config_length=_DEFAULT_MAX_CONFIG, + max_config_length=None, ): """ Create a new execution entry in the DESC data registry. @@ -57,6 +48,10 @@ def create( The execution ID of the new row relating to this entry """ + # Set max configuration file length + if max_config_length is None: + max_config_length = self._DEFAULT_MAX_CONFIG + # Put the execution information together values = {"name": name} if locale: @@ -66,10 +61,10 @@ def create( if description: values["description"] = description values["register_date"] = datetime.now() - values["creator_uid"] = self.parent._uid + values["creator_uid"] = self._uid - exec_table = self.parent._get_table_metadata("execution") - dependency_table = self.parent._get_table_metadata("dependency") + exec_table = self._get_table_metadata("execution") + dependency_table = self._get_table_metadata("dependency") # Read configuration file. Enter contents as a raw string. if configuration: @@ -78,7 +73,7 @@ def create( ) # Enter row into data registry database - with self.parent._engine.connect() as conn: + with self._engine.connect() as conn: my_id = add_table_row(conn, exec_table, values, commit=False) # handle dependencies @@ -97,19 +92,3 @@ def create( conn.commit() return my_id - - def delete(self): - """ - Delete an execution entry from the DESC data registry. - - """ - - raise NotImplementedError - - def modify(self): - """ - Modify an execution entry in the DESC data registry. 
- - """ - - raise NotImplementedError diff --git a/src/dataregistry/registrar/registrar.py b/src/dataregistry/registrar/registrar.py index b29bfea1..5d14682a 100644 --- a/src/dataregistry/registrar/registrar.py +++ b/src/dataregistry/registrar/registrar.py @@ -1,16 +1,10 @@ -import os - -from dataregistry.db_basic import TableMetadata - -from .dataset import RegistrarDataset -from .dataset_alias import RegistrarDatasetAlias -from .execution import RegistrarExecution +from .base_table_class import _OWNER_TYPES +from .dataset import DatasetTable +from .dataset_alias import DatasetAliasTable +from .execution import ExecutionTable __all__ = ["Registrar"] -# Allowed owner types -_OWNER_TYPES = {"user", "project", "group", "production"} - class Registrar: def __init__( @@ -21,7 +15,11 @@ def __init__( owner_type=None, ): """ - Class to register new datasets, executions and alias names. + Wrapper registrar class. + + This holds callable subclasses for each table (dataset, execution and + dataset_alias) which are used to register/modify/delete entries in + those tables. Parameters ---------- @@ -38,27 +36,14 @@ def __init__( instance. """ - # Root directory on disk for data registry files - self._root_dir = root_dir - - # Database engine and dialect. - self._engine = db_connection.engine - self._schema = db_connection.schema - - # Link to Table Metadata. - self._metadata_getter = TableMetadata(db_connection) - - # Store user id - self._uid = os.getenv("USER") - - # Default owner and owner_type's - self._owner = owner - self._owner_type = owner_type - # Class wrappers which are used to create/modify/delete entries - self.dataset = RegistrarDataset(self) - self.execution = RegistrarExecution(self) - self.dataset_alias = RegistrarDatasetAlias(self) + self.execution = ExecutionTable(db_connection, root_dir, owner, owner_type) + self.dataset_alias = DatasetAliasTable( + db_connection, root_dir, owner, owner_type + ) + self.dataset = DatasetTable( + db_connection, root_dir, owner, owner_type, self.execution + ) def get_owner_types(self): """ @@ -72,6 +57,3 @@ def get_owner_types(self): """ return _OWNER_TYPES - - def _get_table_metadata(self, tbl): - return self._metadata_getter.get(tbl) diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index 47cf792c..c0c28b87 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -4,7 +4,7 @@ from dataregistry import DataRegistry from dataregistry.db_basic import SCHEMA_VERSION -from dataregistry.registrar import _OWNER_TYPES +#from dataregistry.registrar import get_owner_types import pytest From 41fbb5894667e0b2620891aa890b29659f244c4f Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 26 Jan 2024 14:02:02 +0100 Subject: [PATCH 09/19] Change 'create' to 'register' for making new database entries --- src/dataregistry/__init__.py | 1 - src/dataregistry/registrar/base_table_class.py | 1 + src/dataregistry/registrar/dataset.py | 4 ++-- src/dataregistry/registrar/dataset_alias.py | 2 +- src/dataregistry/registrar/execution.py | 2 +- src/dataregistry/registrar/registrar.py | 3 ++- tests/end_to_end_tests/test_end_to_end.py | 6 +++--- 7 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/dataregistry/__init__.py b/src/dataregistry/__init__.py index c32e52e1..b14f76e2 100644 --- a/src/dataregistry/__init__.py +++ b/src/dataregistry/__init__.py @@ -1,7 +1,6 @@ from ._version import __version__ from .db_basic import * from .registrar import * -#from 
.registrar_util import * from .query import * from .git_util import * from .DataRegistry import DataRegistry diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index 54a93f60..adf1ec10 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -15,6 +15,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): Base class to register/modify/delete entries in the database tables. Each table subclass (e.g., DatasetTable) will inherit this class. + Functions universal to all tables, such as delete and modify are written here, the register function and other unique functions for the tables are in their own subclasses. diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 157251da..31abceb5 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -24,7 +24,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): self.execution_table = execution_table self.which_table = "dataset" - def create( + def register( self, relative_path, version, @@ -181,7 +181,7 @@ def create( execution_name = f"{execution_name}-{version_suffix}" if execution_description is None: execution_description = "Fabricated execution for dataset" - execution_id = self.execution_table.create( + execution_id = self.execution_table.register( execution_name, description=execution_description, execution_start=execution_start, diff --git a/src/dataregistry/registrar/dataset_alias.py b/src/dataregistry/registrar/dataset_alias.py index ebd6dea2..055d7bc4 100644 --- a/src/dataregistry/registrar/dataset_alias.py +++ b/src/dataregistry/registrar/dataset_alias.py @@ -12,7 +12,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): self.which_table = "dataset_alias" - def create(self, aliasname, dataset_id): + def register(self, aliasname, dataset_id): """ Create a new dataset alias entry in the DESC data registry. diff --git a/src/dataregistry/registrar/execution.py b/src/dataregistry/registrar/execution.py index 0bf4f4d9..dec012d4 100644 --- a/src/dataregistry/registrar/execution.py +++ b/src/dataregistry/registrar/execution.py @@ -12,7 +12,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): self.which_table = "execution" - def create( + def register( self, name, description=None, diff --git a/src/dataregistry/registrar/registrar.py b/src/dataregistry/registrar/registrar.py index 5d14682a..4c82602d 100644 --- a/src/dataregistry/registrar/registrar.py +++ b/src/dataregistry/registrar/registrar.py @@ -36,7 +36,8 @@ def __init__( instance. 
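[Editor's sketch, not part of the patch] An informal illustration of this wrapper layout: the three table objects hang off a DataRegistry instance, and each exposes its own register(). The root_dir and entry values below are made up, and a reachable database is assumed.

from dataregistry import DataRegistry

datareg = DataRegistry(root_dir="/tmp/registry_root")  # hypothetical root_dir

ex_id = datareg.Registrar.execution.register("my_pipeline", description="test run")
ds_id, _ = datareg.Registrar.dataset.register("some/relative/path", "0.0.1", is_dummy=True)
alias_id = datareg.Registrar.dataset_alias.register("nice_name", ds_id)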
""" - # Class wrappers which are used to create/modify/delete entries + # Class wrappers which are used to register/modify/delete entries in + # the database self.execution = ExecutionTable(db_connection, root_dir, owner, owner_type) self.dataset_alias = DatasetAliasTable( db_connection, root_dir, owner, owner_type diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index c0c28b87..f485888a 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -82,7 +82,7 @@ def _insert_alias_entry(datareg, name, dataset_id): The alias ID for this new entry """ - new_id = datareg.Registrar.dataset_alias.create(name, dataset_id) + new_id = datareg.Registrar.dataset_alias.register(name, dataset_id) assert new_id is not None, "Trying to create a dataset alias that already exists" print(f"Created dataset alias entry with id {new_id}") @@ -113,7 +113,7 @@ def _insert_execution_entry( The execution ID for this new entry """ - new_id = datareg.Registrar.execution.create( + new_id = datareg.Registrar.execution.register( name, description=description, input_datasets=input_datasets, @@ -201,7 +201,7 @@ def _insert_dataset_entry( make_sym_link = False # Add new entry. - dataset_id, execution_id = datareg.Registrar.dataset.create( + dataset_id, execution_id = datareg.Registrar.dataset.register( relpath, version, version_suffix=version_suffix, From fd71e206c71d0f1f2406e302864676557292abba Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 26 Jan 2024 14:07:10 +0100 Subject: [PATCH 10/19] Update the modify and delete placeholders --- .../registrar/base_table_class.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index adf1ec10..155431d8 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -61,18 +61,29 @@ def __init__(self, db_connection, root_dir, owner, owner_type): def _get_table_metadata(self, tbl): return self._metadata_getter.get(tbl) - def delete(self): + def delete(self, entry_id): """ - Delete entry from the DESC data registry. + Delete an entry from the DESC data registry. + Parameters + ---------- + entry_id : int + The dataset/execution/etc ID we wish to delete from the database """ raise NotImplementedError - def modify(self): + def modify(self, entry_id, modify_fields): """ - Modify a entry in the DESC data registry. + Modify an entry in the DESC data registry. + Parameters + ---------- + entry_id : int + The dataset/execution/etc ID we wish to delete from the database + modify_fields : dict + Dict where key is the column to modify (must be allowed to modify) + and value is the desired new value for the entry """ raise NotImplementedError From 6ec07fa6983aa5079018f20b06006c762c89ce36 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 26 Jan 2024 14:09:17 +0100 Subject: [PATCH 11/19] Update CLI to register over create --- src/cli/register.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli/register.py b/src/cli/register.py index 60c578e5..3d67bd0f 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -37,7 +37,7 @@ def register_dataset(args): ) # Register new dataset. 
- new_id = datareg.Registrar.dataset.create( + new_id = datareg.Registrar.dataset.register( args.relative_path, args.version, name=args.name, From a75eb86e5ea2c0e4b34fe757617e2db94d9d1c33 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 26 Jan 2024 14:15:32 +0100 Subject: [PATCH 12/19] Fix unit test --- src/dataregistry/DataRegistry.py | 6 +++--- tests/unit_tests/test_root_dir.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/dataregistry/DataRegistry.py b/src/dataregistry/DataRegistry.py index 1ab0b5eb..00814f25 100644 --- a/src/dataregistry/DataRegistry.py +++ b/src/dataregistry/DataRegistry.py @@ -59,18 +59,18 @@ def __init__( self.db_connection = DbConnection(config_file, schema=schema, verbose=verbose) # Work out the location of the root directory - root_dir = self._get_root_dir(root_dir, site) + self.root_dir = self._get_root_dir(root_dir, site) # Create registrar object self.Registrar = Registrar( self.db_connection, - root_dir, + self.root_dir, owner=owner, owner_type=owner_type, ) # Create query object - self.Query = Query(self.db_connection, root_dir) + self.Query = Query(self.db_connection, self.root_dir) def _get_root_dir(self, root_dir, site): """ diff --git a/tests/unit_tests/test_root_dir.py b/tests/unit_tests/test_root_dir.py index 44d1841f..fe0e56ec 100644 --- a/tests/unit_tests/test_root_dir.py +++ b/tests/unit_tests/test_root_dir.py @@ -39,5 +39,4 @@ def test_root_dir_manual(root_dir, site, set_env_var, ans): if reg.db_connection.dialect != "sqlite": assert reg.db_connection.schema is not None - assert reg.Registrar._root_dir == ans - assert reg.Query._root_dir == ans + assert reg.root_dir == ans From 319ae3cf48740746a91ca48ff57968ffa0dd2597 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Thu, 1 Feb 2024 14:10:16 +0100 Subject: [PATCH 13/19] Tweak doc strings --- src/dataregistry/DataRegistry.py | 16 ++++++--------- .../registrar/base_table_class.py | 4 ++-- src/dataregistry/registrar/dataset_alias.py | 18 +---------------- src/dataregistry/registrar/registrar.py | 20 ++++++------------- 4 files changed, 15 insertions(+), 43 deletions(-) diff --git a/src/dataregistry/DataRegistry.py b/src/dataregistry/DataRegistry.py index 00814f25..d4bef9b5 100644 --- a/src/dataregistry/DataRegistry.py +++ b/src/dataregistry/DataRegistry.py @@ -22,16 +22,17 @@ def __init__( """ Primary data registry wrapper class. - Class links to both the Registrar class, to registry new dataset, and - the Query class, to query existing datasets. + The DataRegistry class links to both the Registrar class, to + register/modify/delete datasets, and the Query class, to query existing + datasets. Links to the database is done automatically using the: - the users config file (if None defaults are used) - - the passed schema (if None default is used) + - the passed schema (if None the default schema is used) The `root_dir` is the location the data is copied to. This can be manually passed, or alternately a predefined `site` can be chosen. If - nether are chosen, the NERSC site will be selected. + nether are chosen, the NERSC site will be selected as the default. 
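[Editor's sketch, not part of the patch] To make the resolution order concrete, a small hedged example; the paths are placeholders and a valid database configuration is assumed.

from dataregistry import DataRegistry

# An explicitly passed root_dir is used as given.
datareg = DataRegistry(root_dir="/some/scratch/area")
print(datareg.root_dir)  # the resolved location that registered data will be copied to

# With neither root_dir nor site passed, the NERSC site is selected by default.
datareg_default = DataRegistry()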
Parameters ---------- @@ -62,12 +63,7 @@ def __init__( self.root_dir = self._get_root_dir(root_dir, site) # Create registrar object - self.Registrar = Registrar( - self.db_connection, - self.root_dir, - owner=owner, - owner_type=owner_type, - ) + self.Registrar = Registrar(self.db_connection, self.root_dir, owner, owner_type) # Create query object self.Query = Query(self.db_connection, self.root_dir) diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index 155431d8..db64ba7b 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -17,8 +17,8 @@ def __init__(self, db_connection, root_dir, owner, owner_type): Each table subclass (e.g., DatasetTable) will inherit this class. Functions universal to all tables, such as delete and modify are - written here, the register function and other unique functions for the - tables are in their own subclasses. + written here, the register function, and other unique functions for the + tables, are in their respective subclasses. Parameters ---------- diff --git a/src/dataregistry/registrar/dataset_alias.py b/src/dataregistry/registrar/dataset_alias.py index 055d7bc4..8d925f05 100644 --- a/src/dataregistry/registrar/dataset_alias.py +++ b/src/dataregistry/registrar/dataset_alias.py @@ -14,7 +14,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): def register(self, aliasname, dataset_id): """ - Create a new dataset alias entry in the DESC data registry. + Create a new `dataset_alias` entry in the DESC data registry. Any args marked with '**' share their name with the associated column in the registry schema. Descriptions of what these columns are can be @@ -53,19 +53,3 @@ def register(self, aliasname, dataset_id): conn.execute(stmt) conn.commit() return prim_key - - def delete(self): - """ - Delete a dataset alias entry from the DESC data registry. - - """ - - raise NotImplementedError - - def modify(self): - """ - Modify a dataset alias entry in the DESC data registry. - - """ - - raise NotImplementedError diff --git a/src/dataregistry/registrar/registrar.py b/src/dataregistry/registrar/registrar.py index 4c82602d..b7e846d2 100644 --- a/src/dataregistry/registrar/registrar.py +++ b/src/dataregistry/registrar/registrar.py @@ -7,19 +7,11 @@ class Registrar: - def __init__( - self, - db_connection, - root_dir, - owner=None, - owner_type=None, - ): + def __init__(self, db_connection, root_dir, owner, owner_type): """ - Wrapper registrar class. - - This holds callable subclasses for each table (dataset, execution and - dataset_alias) which are used to register/modify/delete entries in - those tables. + The registrar class is a wrapper for each table subclass (dataset, + execution and dataset_alias). Each table subclass can + register/modify/delete entries in those tables. Parameters ---------- @@ -37,7 +29,7 @@ def __init__( """ # Class wrappers which are used to register/modify/delete entries in - # the database + # their respective tables in the database self.execution = ExecutionTable(db_connection, root_dir, owner, owner_type) self.dataset_alias = DatasetAliasTable( db_connection, root_dir, owner, owner_type @@ -48,7 +40,7 @@ def __init__( def get_owner_types(self): """ - Returns a list of allowed owner_types that can be registered within the + Returns a list of allowed `owner_types` that can be registered within the data registry. 
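[Editor's sketch, not part of the patch] For example, assuming a DataRegistry instance built as elsewhere in this series:

from dataregistry import DataRegistry

datareg = DataRegistry(root_dir="/tmp/registry_root")  # hypothetical root_dir
owner_types = datareg.Registrar.get_owner_types()
# -> {"user", "project", "group", "production"}
assert "production" in owner_types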
Returns From fa019ee3e4d9fce8964c26bc113de79a0d63b4e0 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 9 Feb 2024 14:38:04 +0100 Subject: [PATCH 14/19] Remove commented line --- tests/end_to_end_tests/test_end_to_end.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index f485888a..f80dfea1 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -4,7 +4,6 @@ from dataregistry import DataRegistry from dataregistry.db_basic import SCHEMA_VERSION -#from dataregistry.registrar import get_owner_types import pytest From c95897190592b599022ecae75cdb150b6156a20c Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 9 Feb 2024 16:24:19 +0100 Subject: [PATCH 15/19] Fix ability to delete datasets with new restructuring --- .../registrar/base_table_class.py | 64 +- src/dataregistry/registrar/dataset.py | 915 ++++----- tests/end_to_end_tests/test_end_to_end.py | 1627 +++++++++-------- 3 files changed, 1301 insertions(+), 1305 deletions(-) diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index 9ada4cf7..1dab7c55 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -1,6 +1,19 @@ import os from dataregistry.db_basic import TableMetadata +from sqlalchemy import select, update +from datetime import datetime + +from .registrar_util import ( + _bump_version, + _copy_data, + _form_dataset_path, + _name_from_relpath, + _parse_version_string, + _read_configuration_file, + get_directory_info, +) +from .dataset_util import set_dataset_status, get_dataset_status # Allowed owner types _OWNER_TYPES = {"user", "project", "group", "production"} @@ -68,60 +81,11 @@ def delete(self, entry_id): Parameters ---------- entry_id : int - The dataset/execution/etc ID we wish to delete from the database + Entry we want to delete from the registry """ raise NotImplementedError - """ - Delete a dataset entry from the DESC data registry. - - This will remove the raw data from the root dir, but the dataset entry - remains in the registry (now with `status=3`). 
- - Parameters - ---------- - dataset_id : int - Dataset we want to delete from the registry - """ - -# # First make sure the given dataset id is in the registry -# dataset_table = self.parent._get_table_metadata("dataset") -# previous_dataset = self._find_previous(dataset_table, dataset_id=dataset_id) -# -# if previous_dataset is None: -# raise ValueError(f"Dataset ID {dataset_id} does not exist") -# if previous_dataset.status not in [0, 1]: -# raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") -# -# # Update the status of the dataset to deleted -# with self.parent._engine.connect() as conn: -# update_stmt = ( -# update(dataset_table) -# .where(dataset_table.c.dataset_id == dataset_id) -# .values( -# status=3, -# delete_date=datetime.now(), -# delete_uid=self.parent._uid, -# ) -# ) -# conn.execute(update_stmt) -# conn.commit() -# -# # Delete the physical data in the root_dir -# if previous_dataset.status == 1: -# data_path = _form_dataset_path( -# previous_dataset.owner_type, -# previous_dataset.owner, -# previous_dataset.relative_path, -# schema=self.parent._schema, -# root_dir=self.parent._root_dir, -# ) -# print(f"Deleting data {data_path}") -# os.remove(data_path) -# -# print(f"Deleted {dataset_id} from data registry") - def modify(self, entry_id, modify_fields): """ Modify an entry in the DESC data registry. diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 76bdafa1..e8ce0a6b 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -7,450 +7,481 @@ from .base_table_class import BaseTable from .registrar_util import ( - _bump_version, - _copy_data, - _form_dataset_path, - _name_from_relpath, - _parse_version_string, - _read_configuration_file, - get_directory_info, + _bump_version, + _copy_data, + _form_dataset_path, + _name_from_relpath, + _parse_version_string, + _read_configuration_file, + get_directory_info, ) from .dataset_util import set_dataset_status, get_dataset_status class DatasetTable(BaseTable): - def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): - super().__init__(db_connection, root_dir, owner, owner_type) - - self.execution_table = execution_table - self.which_table = "dataset" - - def register( - self, - relative_path, - version, - version_suffix=None, - name=None, - creation_date=None, - description=None, - execution_id=None, - access_API=None, - access_API_configuration=None, - is_overwritable=False, - old_location=None, - copy=True, - is_dummy=False, - verbose=False, - owner=None, - owner_type=None, - execution_name=None, - execution_description=None, - execution_start=None, - execution_locale=None, - execution_configuration=None, - input_datasets=[], - input_production_datasets=[], - max_config_length=None, - ): - """ - Create a new dataset entry in the DESC data registry. - - Any args marked with '**' share their name with the associated column - in the registry schema. Descriptions of what these columns are can be - found in `schema.yaml` or the documentation. - - First, the dataset entry is created in the database. If success, the - data is then copied (if `old_location` was provided). Only if both - steps are successful will there be "valid" status entry in the - registry. 
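[Editor's sketch, not part of the patch] An illustration of that two-step flow (database row first, then the copy from old_location). The paths, owner and the datareg instance are assumptions, not values from the patch.

ds_id, ex_id = datareg.Registrar.dataset.register(
    "DESC/datasets/my_copied_dataset",        # relative_path inside the registry (hypothetical)
    "1.0.0",                                  # explicit semantic version string
    old_location="/path/to/local/file1.txt",  # hypothetical file copied into root_dir
    owner="my_group",                         # hypothetical owner
    owner_type="group",
    description="Example registration with a real file",
)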
- - Parameters - ---------- - relative_path** : str - version** : str - version_suffix** : str, optional - name** : str, optional - creation_date** : datetime, optional - description** : str, optional - execution_id** : int, optional - access_API** : str, optional - is_overwritable** : bool, optional - old_location : str, optional - Absolute location of dataset to copy into the data registry. - - If None, dataset should already be at correct relative_path within - the data registry. - copy : bool, optional - True to copy data from ``old_location`` into the data registry - (default behaviour). - False to create a symlink. - is_dummy : bool, optional - True for "dummy" datasets (no data is copied, for testing purposes - only) - verbose : bool, optional - Provide some additional output information - owner** : str, optional - owner_type** : str, optional - execution_name** : str, optional - execution_description** : str, optional - execution_start** : datetime, optional - execution_locale** : str, optional - execution_configuration** : str, optional - input_datasets : list, optional - List of dataset ids that were the input to this execution - input_production_datasets : list, optional - List of production dataset ids that were the input to this execution - max_config_length : int, optional - Maxiumum number of lines to read from a configuration file - - Returns - ------- - prim_key : int - The dataset ID of the new row relating to this entry (else None) - execution_id : int - The execution ID associated with the dataset - """ - - # Set max configuration file length - if max_config_length is None: - max_config_length = self._DEFAULT_MAX_CONFIG - - # Make sure the owner_type is legal - if owner_type is None: - if self._owner_type is not None: - owner_type = self._owner_type - else: - owner_type = "user" - if owner_type not in self._OWNER_TYPES: - raise ValueError(f"{owner_type} is not a valid owner_type") - - # Establish the dataset owner - if owner is None: - if self._owner is not None: - owner = self._owner - else: - owner = self._uid - if owner_type == "production": - owner = "production" - - # Checks for production datasets - if owner_type == "production": - if is_overwritable: - raise ValueError("Cannot overwrite production entries") - if version_suffix is not None: - raise ValueError("Production entries can't have version suffix") - if self._schema != "production": - raise ValueError( - "Only the production schema can handle owner_type='production'" - ) - else: - if self._schema == "production": - raise ValueError( - "Only the production schema can handle owner_type='production'" - ) - - # If `name` not passed, automatically generate a name from the relative path - if name is None: - name = _name_from_relpath(relative_path) - - # Look for previous entries. 
Fail if not overwritable - dataset_table = self._get_table_metadata("dataset") - previous_dataset = self._find_previous( - dataset_table, - relative_path=relative_path, - owner=owner, - owner_type=owner_type, - ) - - if previous_dataset is not None: - if not previous_dataset.is_overwritable: - print(f"Dataset {relative_path} exists, and is not overwritable") - return None - - # Deal with version string (non-special case) - if version not in ["major", "minor", "patch"]: - v_fields = _parse_version_string(version) - version_string = version - else: - # Generate new version fields based on previous entries - # with the same name field and same suffix (i.e., bump) - v_fields = _bump_version( - name, version, version_suffix, dataset_table, self._engine - ) - version_string = ( - f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" - ) - - # If no execution_id is supplied, create a minimal entry - if execution_id is None: - if execution_name is None: - execution_name = f"for_dataset_{name}-{version_string}" - if version_suffix: - execution_name = f"{execution_name}-{version_suffix}" - if execution_description is None: - execution_description = "Fabricated execution for dataset" - execution_id = self.execution_table.register( - execution_name, - description=execution_description, - execution_start=execution_start, - locale=execution_locale, - configuration=execution_configuration, - input_datasets=input_datasets, - input_production_datasets=input_production_datasets, - ) - - # Pull the dataset properties together - values = {"name": name, "relative_path": relative_path} - values["version_major"] = v_fields["major"] - values["version_minor"] = v_fields["minor"] - values["version_patch"] = v_fields["patch"] - values["version_string"] = version_string - if version_suffix: - values["version_suffix"] = version_suffix - if description: - values["description"] = description - if execution_id: - values["execution_id"] = execution_id - if access_API: - values["access_API"] = access_API - if access_API_configuration: - values["access_API_configuration"] = _read_configuration_file( - access_API_configuration, max_config_length - ) - values["is_overwritable"] = is_overwritable - values["is_overwritten"] = False - values["is_external_link"] = False - values["is_archived"] = False - values["register_date"] = datetime.now() - values["owner_type"] = owner_type - values["owner"] = owner - values["creator_uid"] = self._uid - values["register_root_dir"] = self._root_dir - - # We tentatively start with an "invalid" dataset in the database. This - # will be upgraded to valid if the data copying (if any) was successful. - values["status"] = 0 - - # Create a new row in the data registry database. 
- with self._engine.connect() as conn: - prim_key = add_table_row(conn, dataset_table, values, commit=False) - - if previous_dataset is not None: - # Update previous rows, setting is_overwritten to True - update_stmt = ( - update(dataset_table) - .where(dataset_table.c.dataset_id == previous_dataset.dataset_id) - .values(is_overwritten=True) - ) - conn.execute(update_stmt) - conn.commit() - - # Get dataset characteristics; copy to `root_dir` if requested - if not is_dummy: - ( - dataset_organization, - num_files, - total_size, - ds_creation_date, - ) = self._handle_data( - relative_path, old_location, owner, owner_type, verbose - ) - valid_status = 1 - else: - dataset_organization = "dummy" - num_files = 0 - total_size = 0 - ds_creation_date = None - valid_status = 0 - - # Case where use is overwriting the dateset `creation_date` - if creation_date: - ds_creation_date = creation_date - - # Copy was successful, update the entry with dataset metadata - with self._engine.connect() as conn: - update_stmt = ( - update(dataset_table) - .where(dataset_table.c.dataset_id == prim_key) - .values( - data_org=dataset_organization, - nfiles=num_files, - total_disk_space=total_size / 1024 / 1024, - creation_date=ds_creation_date, - status=set_dataset_status(values["status"], valid=True), - ) - ) - conn.execute(update_stmt) - conn.commit() - - return prim_key, execution_id - - def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): - """ - Find characteristics of dataset (i.e., is it a file or directory, how - many files and total disk space of the dataset). - - If old_location is not None, copy the dataset files and directories - into the data registry. - - Parameters - ---------- - relative_path : str - Relative path of dataset in the data registry - old_location : str - Location of data (if not already in the data registry root) - Data will be copied from this location - owner : str - Owner of the dataset - owner_type : str - Owner type of the dataset - verbose : bool - True for extra output - - Returns - ------- - dataset_organization : str - "file", "directory", or "dummy" - num_files : int - Total number of files making up dataset - total_size : float - Total disk space of dataset in bytes - ds_creation_date : datetime - When file or directory was created - """ - - # Get destination directory in data registry. - dest = _form_dataset_path( - owner_type, - owner, - relative_path, - schema=self._schema, - root_dir=self._root_dir, - ) - - # Is the data already on location, or coming from somewhere new? - if old_location: - loc = old_location - else: - loc = dest - - # Get metadata on dataset. 
- if os.path.isfile(loc): - dataset_organization = "file" - elif os.path.isdir(loc): - dataset_organization = "directory" - else: - raise FileNotFoundError(f"Dataset {loc} not found") - - if verbose: - tic = time.time() - print("Collecting metadata...", end="") - - ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) - - if dataset_organization == "directory": - num_files, total_size = get_directory_info(loc) - else: - num_files = 1 - total_size = os.path.getsize(loc) - if verbose: - print(f"took {time.time()-tic:.2f}s") - - # Copy data into data registry - if old_location: - if verbose: - tic = time.time() - print( - f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", - end="", - ) - _copy_data(dataset_organization, old_location, dest) - if verbose: - print(f"took {time.time()-tic:.2f}") - - return dataset_organization, num_files, total_size, ds_creation_date - - def _find_previous( - self, - dataset_table, - relative_path=None, - owner=None, - owner_type=None, - dataset_id=None, - ): - """ - Check to see if a dataset exists already in the registry, and if we are - allowed to overwrite it. - - Can search either by `dataset_id`, or a combination of `relative_path`, - `owner` and `owner_type`. - - Only one dataset should ever be found. - - Parameters - ---------- - dataset_table : SQLAlchemy Table object - Link to the dataset table - relative_path : str, optional - Relative path to dataset - owner : str, optional - Owner of the dataset - owner_type : str, optional - dataset_id : int, optional - - Returns - ------- - r : CursorResult object - Searched dataset - """ - - # Make sure we have all the relavant information - if dataset_id is None: - if (relative_path is None) or (owner is None) or (owner_type is None): - raise ValueError( - "Must pass relative_path, owner and owner_type to _find_previous" - ) - - # Search for dataset in the registry. 
- if dataset_id is None: - stmt = ( - select( - dataset_table.c.dataset_id, - dataset_table.c.is_overwritable, - dataset_table.c.status, - dataset_table.c.owner, - dataset_table.c.owner_type, - dataset_table.c.relative_path, - ) - .where( - dataset_table.c.relative_path == relative_path, - dataset_table.c.owner == owner, - dataset_table.c.owner_type == owner_type, - ) - .order_by(dataset_table.c.dataset_id.desc()) - ) - else: - stmt = ( - select( - dataset_table.c.dataset_id, - dataset_table.c.is_overwritable, - dataset_table.c.status, - dataset_table.c.owner, - dataset_table.c.owner_type, - dataset_table.c.relative_path, - ) - .where( - dataset_table.c.dataset_id == dataset_id, - ) - .order_by(dataset_table.c.dataset_id.desc()) - ) - - with self._engine.connect() as conn: - result = conn.execute(stmt) - conn.commit() - - # If the datasets are overwritable, log their ID, else return None - for r in result: - return r - - return None + def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): + super().__init__(db_connection, root_dir, owner, owner_type) + + self.execution_table = execution_table + self.which_table = "dataset" + + def register( + self, + relative_path, + version, + version_suffix=None, + name=None, + creation_date=None, + description=None, + execution_id=None, + access_API=None, + access_API_configuration=None, + is_overwritable=False, + old_location=None, + copy=True, + is_dummy=False, + verbose=False, + owner=None, + owner_type=None, + execution_name=None, + execution_description=None, + execution_start=None, + execution_locale=None, + execution_configuration=None, + input_datasets=[], + input_production_datasets=[], + max_config_length=None, + ): + """ + Create a new dataset entry in the DESC data registry. + + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + + First, the dataset entry is created in the database. If success, the + data is then copied (if `old_location` was provided). Only if both + steps are successful will there be "valid" status entry in the + registry. + + Parameters + ---------- + relative_path** : str + version** : str + version_suffix** : str, optional + name** : str, optional + creation_date** : datetime, optional + description** : str, optional + execution_id** : int, optional + access_API** : str, optional + is_overwritable** : bool, optional + old_location : str, optional + Absolute location of dataset to copy into the data registry. + + If None, dataset should already be at correct relative_path within + the data registry. + copy : bool, optional + True to copy data from ``old_location`` into the data registry + (default behaviour). + False to create a symlink. 
+ is_dummy : bool, optional + True for "dummy" datasets (no data is copied, for testing purposes + only) + verbose : bool, optional + Provide some additional output information + owner** : str, optional + owner_type** : str, optional + execution_name** : str, optional + execution_description** : str, optional + execution_start** : datetime, optional + execution_locale** : str, optional + execution_configuration** : str, optional + input_datasets : list, optional + List of dataset ids that were the input to this execution + input_production_datasets : list, optional + List of production dataset ids that were the input to this execution + max_config_length : int, optional + Maxiumum number of lines to read from a configuration file + + Returns + ------- + prim_key : int + The dataset ID of the new row relating to this entry (else None) + execution_id : int + The execution ID associated with the dataset + """ + + # Set max configuration file length + if max_config_length is None: + max_config_length = self._DEFAULT_MAX_CONFIG + + # Make sure the owner_type is legal + if owner_type is None: + if self._owner_type is not None: + owner_type = self._owner_type + else: + owner_type = "user" + if owner_type not in self._OWNER_TYPES: + raise ValueError(f"{owner_type} is not a valid owner_type") + + # Establish the dataset owner + if owner is None: + if self._owner is not None: + owner = self._owner + else: + owner = self._uid + if owner_type == "production": + owner = "production" + + # Checks for production datasets + if owner_type == "production": + if is_overwritable: + raise ValueError("Cannot overwrite production entries") + if version_suffix is not None: + raise ValueError("Production entries can't have version suffix") + if self._schema != "production": + raise ValueError( + "Only the production schema can handle owner_type='production'" + ) + else: + if self._schema == "production": + raise ValueError( + "Only the production schema can handle owner_type='production'" + ) + + # If `name` not passed, automatically generate a name from the relative path + if name is None: + name = _name_from_relpath(relative_path) + + # Look for previous entries. 
Fail if not overwritable + dataset_table = self._get_table_metadata("dataset") + previous_dataset = self._find_entry( + relative_path=relative_path, + owner=owner, + owner_type=owner_type, + ) + + if previous_dataset is not None: + if not previous_dataset.is_overwritable: + print(f"Dataset {relative_path} exists, and is not overwritable") + return None + + # Deal with version string (non-special case) + if version not in ["major", "minor", "patch"]: + v_fields = _parse_version_string(version) + version_string = version + else: + # Generate new version fields based on previous entries + # with the same name field and same suffix (i.e., bump) + v_fields = _bump_version( + name, version, version_suffix, dataset_table, self._engine + ) + version_string = ( + f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" + ) + + # If no execution_id is supplied, create a minimal entry + if execution_id is None: + if execution_name is None: + execution_name = f"for_dataset_{name}-{version_string}" + if version_suffix: + execution_name = f"{execution_name}-{version_suffix}" + if execution_description is None: + execution_description = "Fabricated execution for dataset" + execution_id = self.execution_table.register( + execution_name, + description=execution_description, + execution_start=execution_start, + locale=execution_locale, + configuration=execution_configuration, + input_datasets=input_datasets, + input_production_datasets=input_production_datasets, + ) + + # Pull the dataset properties together + values = {"name": name, "relative_path": relative_path} + values["version_major"] = v_fields["major"] + values["version_minor"] = v_fields["minor"] + values["version_patch"] = v_fields["patch"] + values["version_string"] = version_string + if version_suffix: + values["version_suffix"] = version_suffix + if description: + values["description"] = description + if execution_id: + values["execution_id"] = execution_id + if access_API: + values["access_API"] = access_API + if access_API_configuration: + values["access_API_configuration"] = _read_configuration_file( + access_API_configuration, max_config_length + ) + values["is_overwritable"] = is_overwritable + values["is_overwritten"] = False + values["is_external_link"] = False + values["is_archived"] = False + values["register_date"] = datetime.now() + values["owner_type"] = owner_type + values["owner"] = owner + values["creator_uid"] = self._uid + values["register_root_dir"] = self._root_dir + + # We tentatively start with an "invalid" dataset in the database. This + # will be upgraded to valid if the data copying (if any) was successful. + values["status"] = 0 + + # Create a new row in the data registry database. 
+ with self._engine.connect() as conn: + prim_key = add_table_row(conn, dataset_table, values, commit=False) + + if previous_dataset is not None: + # Update previous rows, setting is_overwritten to True + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == previous_dataset.dataset_id) + .values(is_overwritten=True) + ) + conn.execute(update_stmt) + conn.commit() + + # Get dataset characteristics; copy to `root_dir` if requested + if not is_dummy: + ( + dataset_organization, + num_files, + total_size, + ds_creation_date, + ) = self._handle_data( + relative_path, old_location, owner, owner_type, verbose + ) + valid_status = 1 + else: + dataset_organization = "dummy" + num_files = 0 + total_size = 0 + ds_creation_date = None + valid_status = 0 + + # Case where use is overwriting the dateset `creation_date` + if creation_date: + ds_creation_date = creation_date + + # Copy was successful, update the entry with dataset metadata + with self._engine.connect() as conn: + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == prim_key) + .values( + data_org=dataset_organization, + nfiles=num_files, + total_disk_space=total_size / 1024 / 1024, + creation_date=ds_creation_date, + status=set_dataset_status(values["status"], valid=True), + ) + ) + conn.execute(update_stmt) + conn.commit() + + return prim_key, execution_id + + def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): + """ + Find characteristics of dataset (i.e., is it a file or directory, how + many files and total disk space of the dataset). + + If old_location is not None, copy the dataset files and directories + into the data registry. + + Parameters + ---------- + relative_path : str + Relative path of dataset in the data registry + old_location : str + Location of data (if not already in the data registry root) + Data will be copied from this location + owner : str + Owner of the dataset + owner_type : str + Owner type of the dataset + verbose : bool + True for extra output + + Returns + ------- + dataset_organization : str + "file", "directory", or "dummy" + num_files : int + Total number of files making up dataset + total_size : float + Total disk space of dataset in bytes + ds_creation_date : datetime + When file or directory was created + """ + + # Get destination directory in data registry. + dest = _form_dataset_path( + owner_type, + owner, + relative_path, + schema=self._schema, + root_dir=self._root_dir, + ) + + # Is the data already on location, or coming from somewhere new? + if old_location: + loc = old_location + else: + loc = dest + + # Get metadata on dataset. 
+ if os.path.isfile(loc): + dataset_organization = "file" + elif os.path.isdir(loc): + dataset_organization = "directory" + else: + raise FileNotFoundError(f"Dataset {loc} not found") + + if verbose: + tic = time.time() + print("Collecting metadata...", end="") + + ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) + + if dataset_organization == "directory": + num_files, total_size = get_directory_info(loc) + else: + num_files = 1 + total_size = os.path.getsize(loc) + if verbose: + print(f"took {time.time()-tic:.2f}s") + + # Copy data into data registry + if old_location: + if verbose: + tic = time.time() + print( + f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", + end="", + ) + _copy_data(dataset_organization, old_location, dest) + if verbose: + print(f"took {time.time()-tic:.2f}") + + return dataset_organization, num_files, total_size, ds_creation_date + + def _find_entry( + self, + relative_path=None, + owner=None, + owner_type=None, + dataset_id=None, + ): + """ + Find a dataset entry in the database. + + Can search by either: + 1) Just `dataset_id` + 2) A combination of `relative_path`, `owner` and `owner_type`. + + Only one dataset should ever be found. + + Parameters + ---------- + relative_path : str, optional + Relative path to dataset + owner : str, optional + Owner of the dataset + owner_type : str, optional + dataset_id : int, optional + + Returns + ------- + r : CursorResult object + Searched dataset + """ + + # Make sure we have all the relavant information + if dataset_id is None: + if (relative_path is None) or (owner is None) or (owner_type is None): + raise ValueError( + "Must pass relative_path, owner and owner_type to _find_entry" + ) + + # Search for dataset in the registry. + dataset_table = self._get_table_metadata("dataset") + stmt = select(dataset_table) + + if dataset_id is None: + stmt = stmt.where( + dataset_table.c.relative_path == relative_path, + dataset_table.c.owner == owner, + dataset_table.c.owner_type == owner_type, + ) + else: + stmt = stmt.where(dataset_table.c.dataset_id == dataset_id) + + with self._engine.connect() as conn: + result = conn.execute(stmt) + conn.commit() + + # Pull out the single result + for r in result: + return r + + # No results found + return None + + def delete(self, dataset_id): + """ + Delete an dataset entry from the DESC data registry. + + This will also remove the raw data from the root dir, but the dataset + entry remains in the registry (now with an updated `status` field). 
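[Editor's sketch, not part of the patch] A short hedged example of the intended call pattern; datareg and the relative path are assumptions. The row itself is kept, and only its status flags and delete metadata are updated.

ds_id, _ = datareg.Registrar.dataset.register(
    "DESC/datasets/to_be_deleted", "0.0.1", is_dummy=True
)
datareg.Registrar.dataset.delete(ds_id)
# Deleting an unknown id, or an already-deleted dataset, raises ValueError.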
+ + Parameters + ---------- + dataset_id : int + Dataset we want to delete from the registry + """ + + # First make sure the given dataset id is in the registry + dataset_table = self._get_table_metadata(self.which_table) + previous_dataset = self._find_entry(dataset_table, dataset_id=dataset_id) + + # Check dataset exists + if previous_dataset is None: + raise ValueError(f"Dataset ID {dataset_id} does not exist") + # Check dataset is valid + if not get_dataset_status(previous_dataset.status, "valid"): + raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") + # Check dataset has not already been deleted + if get_dataset_status(previous_dataset.status, "deleted"): + raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") + + # Update the status of the dataset to deleted + with self._engine.connect() as conn: + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == dataset_id) + .values( + status=set_dataset_status(previous_dataset.status, deleted=True), + delete_date=datetime.now(), + delete_uid=self._uid, + ) + ) + conn.execute(update_stmt) + conn.commit() + + # Delete the physical data in the root_dir + if previous_dataset.data_org != "dummy": + data_path = _form_dataset_path( + previous_dataset.owner_type, + previous_dataset.owner, + previous_dataset.relative_path, + schema=self._schema, + root_dir=self._root_dir, + ) + print(f"Deleting data {data_path}") + os.remove(data_path) + + print(f"Deleted {dataset_id} from data registry") diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index fad00de4..da60e2d2 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -7,113 +7,114 @@ import pytest from dataregistry.registrar.registrar_util import _form_dataset_path +from dataregistry.registrar.dataset_util import set_dataset_status, get_dataset_status @pytest.fixture def dummy_file(tmp_path): - """ - Create some dummy (temporary) files and directories - - Parameters - ---------- - tmp_path : pathlib.Path object - - Returns - ------- - tmp_src_dir : pathlib.Path object - Temporary files we are going to be copying into the registry will be - created in here - tmp_root_dir : pathlib.Path object - Temporary root_dir for the registry we can copy files to - """ - - # Temp dir for files that we copy files from (old_location) - tmp_src_dir = tmp_path / "source" - tmp_src_dir.mkdir() - - for i in range(2): - f = tmp_src_dir / f"file{i+1}.txt" - f.write_text("i am a dummy file") - - p = tmp_src_dir / "directory1" - p.mkdir() - f = p / "file2.txt" - f.write_text("i am another dummy file") - - # Temp root_dir of the registry - tmp_root_dir = tmp_path / "root_dir" - for THIS_SCHEMA in [SCHEMA_VERSION + "/", ""]: - p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}/dummy_dir" - p.mkdir(parents=True) - - f = p / "file1.txt" - f.write_text("i am another dummy file (but on location in a dir)") - - p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}" - f = p / "file1.txt" - f.write_text("i am another dummy file (but on location)") - - # Make a dummy configuration yaml file - data = { - "run_by": "somebody", - "software_version": {"major": 1, "minor": 1, "patch": 0}, - "an_important_list": [1, 2, 3], - } - - # Write the data to the YAML file - with open(tmp_src_dir / "dummy_configuration_file.yaml", "w") as file: - yaml.dump(data, file, default_flow_style=False) - - return tmp_src_dir, tmp_root_dir + """ + Create some dummy (temporary) files 
and directories + + Parameters + ---------- + tmp_path : pathlib.Path object + + Returns + ------- + tmp_src_dir : pathlib.Path object + Temporary files we are going to be copying into the registry will be + created in here + tmp_root_dir : pathlib.Path object + Temporary root_dir for the registry we can copy files to + """ + + # Temp dir for files that we copy files from (old_location) + tmp_src_dir = tmp_path / "source" + tmp_src_dir.mkdir() + + for i in range(2): + f = tmp_src_dir / f"file{i+1}.txt" + f.write_text("i am a dummy file") + + p = tmp_src_dir / "directory1" + p.mkdir() + f = p / "file2.txt" + f.write_text("i am another dummy file") + + # Temp root_dir of the registry + tmp_root_dir = tmp_path / "root_dir" + for THIS_SCHEMA in [SCHEMA_VERSION + "/", ""]: + p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}/dummy_dir" + p.mkdir(parents=True) + + f = p / "file1.txt" + f.write_text("i am another dummy file (but on location in a dir)") + + p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}" + f = p / "file1.txt" + f.write_text("i am another dummy file (but on location)") + + # Make a dummy configuration yaml file + data = { + "run_by": "somebody", + "software_version": {"major": 1, "minor": 1, "patch": 0}, + "an_important_list": [1, 2, 3], + } + + # Write the data to the YAML file + with open(tmp_src_dir / "dummy_configuration_file.yaml", "w") as file: + yaml.dump(data, file, default_flow_style=False) + + return tmp_src_dir, tmp_root_dir def _insert_alias_entry(datareg, name, dataset_id): - """ - Wrapper to create dataset alias entry - - Parameters - ---------- - name : str - Name of alias - dataset_id : int - Dataset we are assigning alias name to - - Returns - ------- - new_id : int - The alias ID for this new entry - """ + """ + Wrapper to create dataset alias entry + + Parameters + ---------- + name : str + Name of alias + dataset_id : int + Dataset we are assigning alias name to + + Returns + ------- + new_id : int + The alias ID for this new entry + """ new_id = datareg.Registrar.dataset_alias.register(name, dataset_id) - assert new_id is not None, "Trying to create a dataset alias that already exists" - print(f"Created dataset alias entry with id {new_id}") + assert new_id is not None, "Trying to create a dataset alias that already exists" + print(f"Created dataset alias entry with id {new_id}") - return new_id + return new_id def _insert_execution_entry( - datareg, name, description, input_datasets=[], configuration=None + datareg, name, description, input_datasets=[], configuration=None ): - """ - Wrapper to create execution entry - - Parameters - ---------- - name : str - Name of execution - description : str - Description of execution - intput_datasets : list - List of dataset ids - configuration : str - Path to configuration file for execution - - Returns - ------- - new_id : int - The execution ID for this new entry - """ + """ + Wrapper to create execution entry + + Parameters + ---------- + name : str + Name of execution + description : str + Description of execution + intput_datasets : list + List of dataset ids + configuration : str + Path to configuration file for execution + + Returns + ------- + new_id : int + The execution ID for this new entry + """ new_id = datareg.Registrar.execution.register( @@ -123,85 +124,85 @@ def _insert_execution_entry( configuration=configuration, ) - assert new_id is not None, "Trying to create a execution that already exists" - print(f"Created execution entry with id {new_id}") + assert new_id is not None, "Trying 
to create a execution that already exists" + print(f"Created execution entry with id {new_id}") - return new_id + return new_id def _insert_dataset_entry( - datareg, - relpath, - version, - owner_type, - owner, - description, - name=None, - execution_id=None, - version_suffix=None, - is_dummy=True, - old_location=None, - is_overwritable=False, - which_datareg=None, - execution_name=None, - execution_description=None, - execution_start=None, - execution_locale=None, - execution_configuration=None, - input_datasets=[], + datareg, + relpath, + version, + owner_type, + owner, + description, + name=None, + execution_id=None, + version_suffix=None, + is_dummy=True, + old_location=None, + is_overwritable=False, + which_datareg=None, + execution_name=None, + execution_description=None, + execution_start=None, + execution_locale=None, + execution_configuration=None, + input_datasets=[], ): - """ - Wrapper to create dataset entry - - Parameters - ---------- - relpath : str - Relative path within the data registry to store the data - Relative to ///... - version : str - Semantic version string (i.e., M.N.P) or - "major", "minor", "patch" to automatically bump the version previous - owner_type : str - Either "production", "group", "user" - owner : str - Dataset owner - description : str - Description of dataset - name : str - A manually selected name for the dataset - execution_id : int - Execution entry related to this dataset - version_suffix : str - Append a suffix to the version string - is_dummy : bool - True for dummy dataset (copies no data) - old_location : str - Path to data to be copied to data registry - which_datareg : DataRegistry object - In case we want to register using a custom DataRegistry object - execution_name : str, optional - Typically pipeline name or program name - execution_description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - execution_locale : str, optional - Where was the execution performed? - execution_configuration : str, optional - Path to text file used to configure the execution - input_datasets : list, optional - List of dataset ids that were the input to this execution - - Returns - ------- - dataset_id : int - The dataset it created for this entry - """ - - # Some defaults over all test datasets - locale = "NERSC" - creation_data = None - make_sym_link = False + """ + Wrapper to create dataset entry + + Parameters + ---------- + relpath : str + Relative path within the data registry to store the data + Relative to ///... + version : str + Semantic version string (i.e., M.N.P) or + "major", "minor", "patch" to automatically bump the version previous + owner_type : str + Either "production", "group", "user" + owner : str + Dataset owner + description : str + Description of dataset + name : str + A manually selected name for the dataset + execution_id : int + Execution entry related to this dataset + version_suffix : str + Append a suffix to the version string + is_dummy : bool + True for dummy dataset (copies no data) + old_location : str + Path to data to be copied to data registry + which_datareg : DataRegistry object + In case we want to register using a custom DataRegistry object + execution_name : str, optional + Typically pipeline name or program name + execution_description : str, optional + Human readible description of execution + execution_start : datetime, optional + Date the execution started + execution_locale : str, optional + Where was the execution performed? 
+ execution_configuration : str, optional + Path to text file used to configure the execution + input_datasets : list, optional + List of dataset ids that were the input to this execution + + Returns + ------- + dataset_id : int + The dataset it created for this entry + """ + + # Some defaults over all test datasets + locale = "NERSC" + creation_data = None + make_sym_link = False # Add new entry. dataset_id, execution_id = datareg.Registrar.dataset.register( @@ -227,703 +228,703 @@ def _insert_dataset_entry( input_datasets=input_datasets, ) - assert dataset_id is not None, "Trying to create a dataset that already exists" - assert execution_id is not None, "Trying to create a execution that already exists" - print(f"Created dataset entry with id {dataset_id}") + assert dataset_id is not None, "Trying to create a dataset that already exists" + assert execution_id is not None, "Trying to create a execution that already exists" + print(f"Created dataset entry with id {dataset_id}") - return dataset_id + return dataset_id def test_simple_query(dummy_file): - """Make a simple entry, and make sure the query returns the correct result""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_dataset", - "0.0.1", - "user", - None, - "This is my first DESC dataset", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.name", - "dataset.version_string", - "dataset.owner", - "dataset.owner_type", - "dataset.description", - "dataset.version_major", - "dataset.version_minor", - "dataset.version_patch", - "dataset.relative_path", - "dataset.version_suffix", - "dataset.data_org", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == "my_first_dataset" - assert getattr(r, "dataset.version_string") == "0.0.1" - assert getattr(r, "dataset.version_major") == 0 - assert getattr(r, "dataset.version_minor") == 0 - assert getattr(r, "dataset.version_patch") == 1 - assert getattr(r, "dataset.owner") == os.getenv("USER") - assert getattr(r, "dataset.owner_type") == "user" - assert getattr(r, "dataset.description") == "This is my first DESC dataset" - assert getattr(r, "dataset.relative_path") == "DESC/datasets/my_first_dataset" - assert getattr(r, "dataset.version_suffix") == None - assert getattr(r, "dataset.data_org") == "dummy" - assert i < 1 + """Make a simple entry, and make sure the query returns the correct result""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_dataset", + "0.0.1", + "user", + None, + "This is my first DESC dataset", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.name", + "dataset.version_string", + "dataset.owner", + "dataset.owner_type", + "dataset.description", + "dataset.version_major", + "dataset.version_minor", + "dataset.version_patch", + "dataset.relative_path", + "dataset.version_suffix", + "dataset.data_org", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == "my_first_dataset" + assert getattr(r, 
"dataset.version_string") == "0.0.1" + assert getattr(r, "dataset.version_major") == 0 + assert getattr(r, "dataset.version_minor") == 0 + assert getattr(r, "dataset.version_patch") == 1 + assert getattr(r, "dataset.owner") == os.getenv("USER") + assert getattr(r, "dataset.owner_type") == "user" + assert getattr(r, "dataset.description") == "This is my first DESC dataset" + assert getattr(r, "dataset.relative_path") == "DESC/datasets/my_first_dataset" + assert getattr(r, "dataset.version_suffix") == None + assert getattr(r, "dataset.data_org") == "dummy" + assert i < 1 def test_manual_name_and_vsuffix(dummy_file): - """Test setting the name and version suffix manually""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/my_second_dataset", - "0.0.1", - "user", - None, - "This is my first DESC dataset", - name="custom name", - version_suffix="custom_suffix", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.name", "dataset.version_suffix"], [f], return_format="cursorresult" - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == "custom name" - assert getattr(r, "dataset.version_suffix") == "custom_suffix" - assert i < 1 + """Test setting the name and version suffix manually""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/my_second_dataset", + "0.0.1", + "user", + None, + "This is my first DESC dataset", + name="custom name", + version_suffix="custom_suffix", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.name", "dataset.version_suffix"], [f], return_format="cursorresult" + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == "custom name" + assert getattr(r, "dataset.version_suffix") == "custom_suffix" + assert i < 1 @pytest.mark.parametrize( - "v_type,ans,name", - [ - ("major", "1.0.0", "my_first_dataset"), - ("minor", "0.1.0", "my_first_dataset"), - ("patch", "0.0.2", "my_first_dataset"), - ("patch", "0.0.1", "my_second_dataset"), - ], + "v_type,ans,name", + [ + ("major", "1.0.0", "my_first_dataset"), + ("minor", "0.1.0", "my_first_dataset"), + ("patch", "0.0.2", "my_first_dataset"), + ("patch", "0.0.1", "my_second_dataset"), + ], ) def test_dataset_bumping(dummy_file, v_type, ans, name): - """ - Test bumping a dataset and make sure the new version is correct. - - Tests bumping datasets with and without a version suffix. 
- """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/bumped_dataset_{v_type}_{name}", - v_type, - "user", - None, - "This is my first bumped DESC dataset", - name=name, - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.name", "dataset.version_string"], [f], return_format="cursorresult" - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.name") == name - assert getattr(r, "dataset.version_string") == ans - assert i < 1 + """ + Test bumping a dataset and make sure the new version is correct. + + Tests bumping datasets with and without a version suffix. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/bumped_dataset_{v_type}_{name}", + v_type, + "user", + None, + "This is my first bumped DESC dataset", + name=name, + ) + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.name", "dataset.version_string"], [f], return_format="cursorresult" + ) -@pytest.mark.parametrize("owner_type", ["user", "group", "project"]) -def test_owner_types(dummy_file, owner_type): - """Test the different owner types""" + for i, r in enumerate(results): + assert getattr(r, "dataset.name") == name + assert getattr(r, "dataset.version_string") == ans + assert i < 1 - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/owner_type_{owner_type}", - "0.0.1", - owner_type, - None, - f"This is a {owner_type} dataset", - ) +@pytest.mark.parametrize("owner_type", ["user", "group", "project"]) +def test_owner_types(dummy_file, owner_type): + """Test the different owner types""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/owner_type_{owner_type}", + "0.0.1", + owner_type, + None, + f"This is a {owner_type} dataset", + ) - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.owner_type"], [f], return_format="cursorresult" - ) + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.owner_type"], [f], return_format="cursorresult" + ) - for i, r in enumerate(results): - assert getattr(r, "dataset.owner_type") == owner_type - assert i < 1 + for i, r in enumerate(results): + assert getattr(r, "dataset.owner_type") == owner_type + assert i < 1 @pytest.mark.parametrize("data_org", ["file", "directory"]) def test_copy_data(dummy_file, data_org): - """Test copying real data into the registry (from an `old_location`)""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # File/directory we are copying in - if data_org == "file": - data_path = str(tmp_src_dir / "file1.txt") - else: - data_path 
= str(tmp_src_dir / "directory1") - - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/copy_real_{data_org}", - "0.0.1", - "user", - None, - "Test copying a real file", - old_location=data_path, - is_dummy=False, - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - ["dataset.data_org", "dataset.nfiles", "dataset.total_disk_space"], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.data_org") == data_org - assert getattr(r, "dataset.nfiles") == 1 - assert getattr(r, "dataset.total_disk_space") > 0 - assert i < 1 + """Test copying real data into the registry (from an `old_location`)""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # File/directory we are copying in + if data_org == "file": + data_path = str(tmp_src_dir / "file1.txt") + else: + data_path = str(tmp_src_dir / "directory1") + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/copy_real_{data_org}", + "0.0.1", + "user", + None, + "Test copying a real file", + old_location=data_path, + is_dummy=False, + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + ["dataset.data_org", "dataset.nfiles", "dataset.total_disk_space"], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.data_org") == data_org + assert getattr(r, "dataset.nfiles") == 1 + assert getattr(r, "dataset.total_disk_space") > 0 + assert i < 1 @pytest.mark.parametrize( - "data_org,data_path,v_str,overwritable", - [ - ("file", "file1.txt", "0.0.1", True), - ("file", "file1.txt", "0.0.2", False), - ("directory", "dummy_dir", "0.0.1", True), - ("directory", "dummy_dir", "0.0.2", False), - ], + "data_org,data_path,v_str,overwritable", + [ + ("file", "file1.txt", "0.0.1", True), + ("file", "file1.txt", "0.0.2", False), + ("directory", "dummy_dir", "0.0.1", True), + ("directory", "dummy_dir", "0.0.2", False), + ], ) def test_on_location_data(dummy_file, data_org, data_path, v_str, overwritable): - """ - Test ingesting real data into the registry (already on location). Also - tests overwriting datasets. - - Does twice for each file, the first is a normal entry with - `is_overwritable=True`. The second tests overwriting the previous data with - a new version. 
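
For orientation, test_copy_data above and test_on_location_data here exercise the two ingestion modes of the registrar. A minimal sketch of both, assuming a DataRegistry handle built as in these tests (paths are illustrative, not real fixtures; other constructor arguments such as the schema are left at their defaults):

from dataregistry import DataRegistry

datareg = DataRegistry(root_dir="/path/to/registry")  # hypothetical root_dir

# Mode 1: the data lives outside the registry; passing old_location makes
# register() copy it under <root_dir>/.../<relative_path>.
d_copy, _ = datareg.Registrar.dataset.register(
    "DESC/datasets/copied_file",
    "0.0.1",
    old_location="/scratch/file1.txt",  # illustrative source path
    is_dummy=False,
)

# Mode 2: the data already sits at its relative_path inside root_dir, so
# old_location stays None and nothing is copied.
d_onloc, _ = datareg.Registrar.dataset.register(
    "file1.txt",
    "0.0.1",
    old_location=None,
    is_dummy=False,
)
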
- """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - d_id = _insert_dataset_entry( - datareg, - data_path, - v_str, - "user", - None, - "Test ingesting a real file on location", - old_location=None, - is_dummy=False, - is_overwritable=overwritable, - ) - - f = datareg.Query.gen_filter("dataset.relative_path", "==", data_path) - results = datareg.Query.find_datasets( - [ - "dataset.data_org", - "dataset.nfiles", - "dataset.total_disk_space", - "dataset.is_overwritable", - "dataset.is_overwritten", - "dataset.version_string", - ], - [f], - return_format="cursorresult", - ) - - num_results = len(results.all()) - for i, r in enumerate(results): - assert getattr(r, "dataset.data_org") == data_org - assert getattr(r, "dataset.nfiles") == 1 - assert getattr(r, "dataset.total_disk_space") > 0 - if getattr(r, "version_string") == "0.0.1": - if num_results == 1: - assert getattr(r, "dataset.is_overwritable") == True - assert getattr(r, "dataset.is_overwritten") == False - else: - assert getattr(r, "dataset.is_overwritable") == True - assert getattr(r, "dataset.is_overwritten") == True - else: - if num_results == 1: - assert getattr(r, "dataset.is_overwritable") == False - assert getattr(r, "dataset.is_overwritten") == True - else: - assert getattr(r, "dataset.is_overwritable") == False - assert getattr(r, "dataset.is_overwritten") == False - assert i < 2 + """ + Test ingesting real data into the registry (already on location). Also + tests overwriting datasets. + + Does twice for each file, the first is a normal entry with + `is_overwritable=True`. The second tests overwriting the previous data with + a new version. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + d_id = _insert_dataset_entry( + datareg, + data_path, + v_str, + "user", + None, + "Test ingesting a real file on location", + old_location=None, + is_dummy=False, + is_overwritable=overwritable, + ) + + f = datareg.Query.gen_filter("dataset.relative_path", "==", data_path) + results = datareg.Query.find_datasets( + [ + "dataset.data_org", + "dataset.nfiles", + "dataset.total_disk_space", + "dataset.is_overwritable", + "dataset.is_overwritten", + "dataset.version_string", + ], + [f], + return_format="cursorresult", + ) + + num_results = len(results.all()) + for i, r in enumerate(results): + assert getattr(r, "dataset.data_org") == data_org + assert getattr(r, "dataset.nfiles") == 1 + assert getattr(r, "dataset.total_disk_space") > 0 + if getattr(r, "version_string") == "0.0.1": + if num_results == 1: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == False + else: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == True + else: + if num_results == 1: + assert getattr(r, "dataset.is_overwritable") == False + assert getattr(r, "dataset.is_overwritten") == True + else: + assert getattr(r, "dataset.is_overwritable") == False + assert getattr(r, "dataset.is_overwritten") == False + assert i < 2 def test_dataset_alias(dummy_file): - """Register a dataset and make a dataset alias entry for it""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add dataset - d_id = _insert_dataset_entry( - datareg, - 
"alias_test_entry", - "0.0.1", - "user", - None, - "Test dataset alias", - ) - - # Add alias - _insert_alias_entry(datareg, "nice_dataset_name", d_id) - - # Query - f = datareg.Query.gen_filter("dataset_alias.alias", "==", "nice_dataset_name") - results = datareg.Query.find_datasets( - [ - "dataset.dataset_id", - "dataset_alias.dataset_id", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.dataset_id") == d_id - assert getattr(r, "dataset_alias.dataset_id") == d_id - assert i < 1 + """Register a dataset and make a dataset alias entry for it""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add dataset + d_id = _insert_dataset_entry( + datareg, + "alias_test_entry", + "0.0.1", + "user", + None, + "Test dataset alias", + ) + + # Add alias + _insert_alias_entry(datareg, "nice_dataset_name", d_id) + + # Query + f = datareg.Query.gen_filter("dataset_alias.alias", "==", "nice_dataset_name") + results = datareg.Query.find_datasets( + [ + "dataset.dataset_id", + "dataset_alias.dataset_id", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.dataset_id") == d_id + assert getattr(r, "dataset_alias.dataset_id") == d_id + assert i < 1 def test_pipeline_entry(dummy_file): - """ - Test making multiple executions and datasets to form a pipeline. - - Also queries to make sure dependencies are made. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entries - ex_id_1 = _insert_execution_entry( - datareg, "pipeline_stage_1", "The first stage of my pipeline" - ) - - d_id_1 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage1", - "0.0.1", - "user", - None, - "This is data for stage 1 of my first pipeline", - execution_id=ex_id_1, - ) - - ex_id_2 = _insert_execution_entry( - datareg, - "pipeline_stage_2", - "The second stage of my pipeline", - input_datasets=[d_id_1], - ) - - d_id_2 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage2a", - "0.0.1", - "user", - None, - "This is data for stage 2 of my first pipeline", - execution_id=ex_id_2, - ) - - d_id_3 = _insert_dataset_entry( - datareg, - "DESC/datasets/my_first_pipeline_stage2b", - "0.0.1", - "user", - None, - "This is data for stage 2 of my first pipeline", - execution_id=ex_id_2, - ) - - # Stage 3 of my pipeline - ex_id_3 = _insert_execution_entry( - datareg, - "pipeline_stage_3", - "The third stage of my pipeline", - input_datasets=[d_id_2, d_id_3], - ) - - # Query on execution - f = datareg.Query.gen_filter("dataset.execution_id", "==", ex_id_2) - results = datareg.Query.find_datasets( - [ - "dataset.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert "my_first_pipeline_stage2" in getattr(r, "dataset.name") - assert i < 2 - - # Query on dependency - f = datareg.Query.gen_filter("dependency.execution_id", "==", ex_id_2) - results = datareg.Query.find_datasets( - [ - "dependency.execution_id", - "dataset.dataset_id", - "dataset.execution_id", - "dataset.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.dataset_id") == d_id_1 - assert i < 1 + """ + Test making multiple executions and datasets to form a pipeline. 
+ + Also queries to make sure dependencies are made. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entries + ex_id_1 = _insert_execution_entry( + datareg, "pipeline_stage_1", "The first stage of my pipeline" + ) + + d_id_1 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage1", + "0.0.1", + "user", + None, + "This is data for stage 1 of my first pipeline", + execution_id=ex_id_1, + ) + + ex_id_2 = _insert_execution_entry( + datareg, + "pipeline_stage_2", + "The second stage of my pipeline", + input_datasets=[d_id_1], + ) + + d_id_2 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage2a", + "0.0.1", + "user", + None, + "This is data for stage 2 of my first pipeline", + execution_id=ex_id_2, + ) + + d_id_3 = _insert_dataset_entry( + datareg, + "DESC/datasets/my_first_pipeline_stage2b", + "0.0.1", + "user", + None, + "This is data for stage 2 of my first pipeline", + execution_id=ex_id_2, + ) + + # Stage 3 of my pipeline + ex_id_3 = _insert_execution_entry( + datareg, + "pipeline_stage_3", + "The third stage of my pipeline", + input_datasets=[d_id_2, d_id_3], + ) + + # Query on execution + f = datareg.Query.gen_filter("dataset.execution_id", "==", ex_id_2) + results = datareg.Query.find_datasets( + [ + "dataset.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert "my_first_pipeline_stage2" in getattr(r, "dataset.name") + assert i < 2 + + # Query on dependency + f = datareg.Query.gen_filter("dependency.execution_id", "==", ex_id_2) + results = datareg.Query.find_datasets( + [ + "dependency.execution_id", + "dataset.dataset_id", + "dataset.execution_id", + "dataset.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.dataset_id") == d_id_1 + assert i < 1 def test_global_owner_set(dummy_file): - """ - Test setting the owner and owner_type globally during the database - initialization. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry( - root_dir=str(tmp_root_dir), - schema=SCHEMA_VERSION, - owner="DESC group", - owner_type="group", - ) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/global_user_dataset", - "0.0.1", - None, - None, - "Should be allocated user and user_type from global config", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.owner", - "dataset.owner_type", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.owner") == "DESC group" - assert getattr(r, "dataset.owner_type") == "group" - assert i < 1 + """ + Test setting the owner and owner_type globally during the database + initialization. 
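
The defaulting behaviour this test checks is implemented in DatasetTable.register() (see the dataset.py hunks below). Schematically it resolves as in this hypothetical helper; the function name is mine, the logic is paraphrased from the patch:

import os

def resolve_owner(owner, owner_type, default_owner, default_owner_type):
    """Distilled owner/owner_type fallback chain from DatasetTable.register()."""
    # owner_type: per-call value -> Registrar-level default -> "user"
    owner_type = owner_type or default_owner_type or "user"
    # owner: per-call value -> Registrar-level default -> current user id
    owner = owner or default_owner or os.getenv("USER")
    # Production entries always belong to the "production" owner
    if owner_type == "production":
        owner = "production"
    return owner, owner_type

# With DataRegistry(owner="DESC group", owner_type="group"), a register()
# call that passes neither field therefore lands on ("DESC group", "group").
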
+ """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry( + root_dir=str(tmp_root_dir), + schema=SCHEMA_VERSION, + owner="DESC group", + owner_type="group", + ) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/global_user_dataset", + "0.0.1", + None, + None, + "Should be allocated user and user_type from global config", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.owner", + "dataset.owner_type", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.owner") == "DESC group" + assert getattr(r, "dataset.owner_type") == "group" + assert i < 1 @pytest.mark.skip(reason="Can't do production related things with sqlite") def test_prooduction_schema(dummy_file): - """ - Test making multiple executions and datasets to form a pipeline. - - Also queries to make sure dependencies are made. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") - - d_id = _insert_dataset_entry( - datareg, - "DESC/datasets/production_dataset_1", - "0.0.1", - "production", - None, - "This is production's first dataset", - ) - - # Query - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.owner", - "dataset.owner_type", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dataset.owner") == "production" - assert getattr(r, "dataset.owner_type") == "production" - assert i < 1 + """ + Test making multiple executions and datasets to form a pipeline. + + Also queries to make sure dependencies are made. 
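
As an aside, this test is skipped because production entries live in a dedicated "production" schema that the sqlite backend does not provide. The rules it would exercise are enforced in DatasetTable.register() later in this patch; a hypothetical standalone helper mirroring those checks:

def check_production_rules(owner_type, is_overwritable, version_suffix, schema):
    """Mirror of the production-entry checks in DatasetTable.register()."""
    if owner_type == "production":
        if is_overwritable:
            raise ValueError("Cannot overwrite production entries")
        if version_suffix is not None:
            raise ValueError("Production entries can't have version suffix")
        if schema != "production":
            raise ValueError(
                "Only the production schema can handle owner_type='production'"
            )
    elif schema == "production":
        raise ValueError(
            "Only the production schema can handle owner_type='production'"
        )
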
+ """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + + d_id = _insert_dataset_entry( + datareg, + "DESC/datasets/production_dataset_1", + "0.0.1", + "production", + None, + "This is production's first dataset", + ) + + # Query + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.owner", + "dataset.owner_type", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dataset.owner") == "production" + assert getattr(r, "dataset.owner_type") == "production" + assert i < 1 def test_execution_config_file(dummy_file): - """Test ingesting a configuration file with an execution entry""" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - ex_id = _insert_execution_entry( - datareg, - "execution_with_configuration", - "An execution with an input configuration file", - configuration=str(tmp_src_dir / "dummy_configuration_file.yaml"), - ) - - # Query - f = datareg.Query.gen_filter("execution.execution_id", "==", ex_id) - results = datareg.Query.find_datasets( - [ - "execution.configuration", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "execution.configuration") is not None - assert i < 1 + """Test ingesting a configuration file with an execution entry""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Add entry + ex_id = _insert_execution_entry( + datareg, + "execution_with_configuration", + "An execution with an input configuration file", + configuration=str(tmp_src_dir / "dummy_configuration_file.yaml"), + ) + + # Query + f = datareg.Query.gen_filter("execution.execution_id", "==", ex_id) + results = datareg.Query.find_datasets( + [ + "execution.configuration", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "execution.configuration") is not None + assert i < 1 def test_dataset_with_execution(dummy_file): - """ - Test modifying the datasets default execution directly when registering the - dataset - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - d_id_1 = _insert_dataset_entry( - datareg, - "DESC/datasets/execution_test_input", - "0.0.1", - None, - None, - "This is production's first dataset", - ) - - d_id_2 = _insert_dataset_entry( - datareg, - "DESC/datasets/execution_test", - "0.0.1", - None, - None, - "This should have a more descriptive execution", - execution_name="Overwrite execution auto name", - execution_description="Overwrite execution auto description", - execution_locale="TestMachine", - input_datasets=[d_id_1], - ) - - # Query on execution - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id_2) - results = datareg.Query.find_datasets( - [ - "dataset.name", - "execution.execution_id", - "execution.description", - "execution.locale", - "execution.name", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "execution.name") == "Overwrite execution auto name" - assert ( - getattr(r, "execution.description") - == "Overwrite execution auto 
description" - ) - assert getattr(r, "execution.locale") == "TestMachine" - ex_id_1 = getattr(r, "execution.execution_id") - assert i < 1 - - # Query on dependency - f = datareg.Query.gen_filter("dependency.input_id", "==", d_id_1) - results = datareg.Query.find_datasets( - [ - "dataset.dataset_id", - "dependency.execution_id", - "dependency.input_id", - ], - [f], - return_format="cursorresult", - ) - - for i, r in enumerate(results): - assert getattr(r, "dependency.execution_id") == ex_id_1 - assert i < 1 + """ + Test modifying the datasets default execution directly when registering the + dataset + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + d_id_1 = _insert_dataset_entry( + datareg, + "DESC/datasets/execution_test_input", + "0.0.1", + None, + None, + "This is production's first dataset", + ) + + d_id_2 = _insert_dataset_entry( + datareg, + "DESC/datasets/execution_test", + "0.0.1", + None, + None, + "This should have a more descriptive execution", + execution_name="Overwrite execution auto name", + execution_description="Overwrite execution auto description", + execution_locale="TestMachine", + input_datasets=[d_id_1], + ) + + # Query on execution + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id_2) + results = datareg.Query.find_datasets( + [ + "dataset.name", + "execution.execution_id", + "execution.description", + "execution.locale", + "execution.name", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "execution.name") == "Overwrite execution auto name" + assert ( + getattr(r, "execution.description") + == "Overwrite execution auto description" + ) + assert getattr(r, "execution.locale") == "TestMachine" + ex_id_1 = getattr(r, "execution.execution_id") + assert i < 1 + + # Query on dependency + f = datareg.Query.gen_filter("dependency.input_id", "==", d_id_1) + results = datareg.Query.find_datasets( + [ + "dataset.dataset_id", + "dependency.execution_id", + "dependency.input_id", + ], + [f], + return_format="cursorresult", + ) + + for i, r in enumerate(results): + assert getattr(r, "dependency.execution_id") == ex_id_1 + assert i < 1 def test_get_dataset_absolute_path(dummy_file): - """ - Test the generation of the full absolute path of a dataset using the - `Query.get_dataset_absolute_path` function - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - dset_relpath = "DESC/datasets/get_dataset_absolute_path_test" - dset_ownertype = "group" - dset_owner = "group1" - - # Make a basic entry - d_id_1 = _insert_dataset_entry( - datareg, - dset_relpath, - "0.0.1", - dset_ownertype, - dset_owner, - "Test the Query.get_dataset_absolute_path function", - ) - - v = datareg.Query.get_dataset_absolute_path(d_id_1) - - if datareg.Query._dialect == "sqlite": - assert v == os.path.join( - str(tmp_root_dir), dset_ownertype, dset_owner, dset_relpath - ) - else: - assert v == os.path.join( - str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath - ) + """ + Test the generation of the full absolute path of a dataset using the + `Query.get_dataset_absolute_path` function + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + dset_relpath = 
"DESC/datasets/get_dataset_absolute_path_test" + dset_ownertype = "group" + dset_owner = "group1" + + # Make a basic entry + d_id_1 = _insert_dataset_entry( + datareg, + dset_relpath, + "0.0.1", + dset_ownertype, + dset_owner, + "Test the Query.get_dataset_absolute_path function", + ) + + v = datareg.Query.get_dataset_absolute_path(d_id_1) + + if datareg.Query._dialect == "sqlite": + assert v == os.path.join( + str(tmp_root_dir), dset_ownertype, dset_owner, dset_relpath + ) + else: + assert v == os.path.join( + str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath + ) @pytest.mark.parametrize( - "is_dummy,dataset_name", - [ - (True, "dummy_dataset_to_delete"), - (False, "real_dataset_to_delete"), - ], + "is_dummy,dataset_name", + [ + (True, "dummy_dataset_to_delete"), + (False, "real_dataset_to_delete"), + ], ) def test_delete_entry(dummy_file, is_dummy, dataset_name): - """ - Make a simple entry, then delete it, then check it was deleted. - - Does this for a dummy dataset and a real one. - """ - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Make sure we raise an exception trying to delete a dataset that doesn't exist - with pytest.raises(ValueError, match="does not exist"): - datareg.Registrar.dataset.delete(10000) - - # Where is the real data? - if is_dummy: - data_path = None - else: - data_path = str(tmp_src_dir / "file2.txt") - assert os.path.isfile(data_path) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - f"DESC/datasets/{dataset_name}", - "0.0.1", - "user", - None, - "A dataset to delete", - is_dummy=is_dummy, - old_location=data_path, - ) - - # Now delete that entry - datareg.Registrar.dataset.delete(d_id) - - # Check the entry was deleted - f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) - results = datareg.Query.find_datasets( - [ - "dataset.status", - "dataset.delete_date", - "dataset.delete_uid", - "dataset.owner_type", - "dataset.owner", - "dataset.relative_path", - ], - [f], - return_format="cursorresult", - ) - - for r in results: - assert getattr(r, "dataset.status") == 3 - assert getattr(r, "dataset.delete_date") is not None - assert getattr(r, "dataset.delete_uid") is not None - - if not is_dummy: - # Make sure the file in the root_dir has gone - data_path = _form_dataset_path( - getattr(r, "dataset.owner_type"), - getattr(r, "dataset.owner"), - getattr(r, "dataset.relative_path"), - schema=SCHEMA_VERSION, - root_dir=str(tmp_root_dir), - ) - assert not os.path.isfile(data_path) - - # Make sure we can not delete an already deleted entry. - with pytest.raises(ValueError, match="not have a valid status"): - datareg.Registrar.dataset.delete(d_id) + """ + Make a simple entry, then delete it, then check it was deleted. + + Does this for a dummy dataset and a real one. + """ + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + + # Make sure we raise an exception trying to delete a dataset that doesn't exist + with pytest.raises(ValueError, match="does not exist"): + datareg.Registrar.dataset.delete(10000) + + # Where is the real data? 
+ if is_dummy: + data_path = None + else: + data_path = str(tmp_src_dir / "file2.txt") + assert os.path.isfile(data_path) + + # Add entry + d_id = _insert_dataset_entry( + datareg, + f"DESC/datasets/{dataset_name}", + "0.0.1", + "user", + None, + "A dataset to delete", + is_dummy=is_dummy, + old_location=data_path, + ) + + # Now delete that entry + datareg.Registrar.dataset.delete(d_id) + + # Check the entry was deleted + f = datareg.Query.gen_filter("dataset.dataset_id", "==", d_id) + results = datareg.Query.find_datasets( + [ + "dataset.status", + "dataset.delete_date", + "dataset.delete_uid", + "dataset.owner_type", + "dataset.owner", + "dataset.relative_path", + ], + [f], + return_format="cursorresult", + ) + + for r in results: + assert get_dataset_status(getattr(r, "dataset.status"), "deleted") + assert getattr(r, "dataset.delete_date") is not None + assert getattr(r, "dataset.delete_uid") is not None + + if not is_dummy: + # Make sure the file in the root_dir has gone + data_path = _form_dataset_path( + getattr(r, "dataset.owner_type"), + getattr(r, "dataset.owner"), + getattr(r, "dataset.relative_path"), + schema=SCHEMA_VERSION, + root_dir=str(tmp_root_dir), + ) + assert not os.path.isfile(data_path) + + # Make sure we can not delete an already deleted entry. + with pytest.raises(ValueError, match="not have a valid status"): + datareg.Registrar.dataset.delete(d_id) From cac985689d066dbbccb0b64eb1e1e44876c0f17e Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 16 Feb 2024 20:15:11 +0100 Subject: [PATCH 16/19] Split find_entry into find_previous and a universal find_entry --- .../registrar/base_table_class.py | 37 +- src/dataregistry/registrar/dataset.py | 935 +++++++++--------- tests/end_to_end_tests/test_end_to_end.py | 32 +- 3 files changed, 519 insertions(+), 485 deletions(-) diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index 1dab7c55..0d08131c 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -28,7 +28,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): Base class to register/modify/delete entries in the database tables. Each table subclass (e.g., DatasetTable) will inherit this class. - + Functions universal to all tables, such as delete and modify are written here, the register function, and other unique functions for the tables, are in their respective subclasses. @@ -100,3 +100,38 @@ def modify(self, entry_id, modify_fields): """ raise NotImplementedError + + def find_entry(self, entry_id): + """ + Find an entry in the database. + + Parameters + ---------- + entry_id : int + Unique identifier for table entry + e.g., dataset_id for the dataset table + + Returns + ------- + r : CursorResult object + Found entry (None if no entry found) + """ + + # Search for dataset in the registry. 
+ my_table = self._get_table_metadata(self.which_table) + + if self.which_table == "dataset": + stmt = select(my_table).where(my_table.c.dataset_id == entry_id) + else: + raise ValueError("Can only perform `find_entry` on dataset table for now") + + with self._engine.connect() as conn: + result = conn.execute(stmt) + conn.commit() + + # Pull out the single result + for r in result: + return r + + # No results found + return None diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index e8ce0a6b..1b12343c 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -7,481 +7,470 @@ from .base_table_class import BaseTable from .registrar_util import ( - _bump_version, - _copy_data, - _form_dataset_path, - _name_from_relpath, - _parse_version_string, - _read_configuration_file, - get_directory_info, + _bump_version, + _copy_data, + _form_dataset_path, + _name_from_relpath, + _parse_version_string, + _read_configuration_file, + get_directory_info, ) from .dataset_util import set_dataset_status, get_dataset_status class DatasetTable(BaseTable): - def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): - super().__init__(db_connection, root_dir, owner, owner_type) - - self.execution_table = execution_table - self.which_table = "dataset" - - def register( - self, - relative_path, - version, - version_suffix=None, - name=None, - creation_date=None, - description=None, - execution_id=None, - access_API=None, - access_API_configuration=None, - is_overwritable=False, - old_location=None, - copy=True, - is_dummy=False, - verbose=False, - owner=None, - owner_type=None, - execution_name=None, - execution_description=None, - execution_start=None, - execution_locale=None, - execution_configuration=None, - input_datasets=[], - input_production_datasets=[], - max_config_length=None, - ): - """ - Create a new dataset entry in the DESC data registry. - - Any args marked with '**' share their name with the associated column - in the registry schema. Descriptions of what these columns are can be - found in `schema.yaml` or the documentation. - - First, the dataset entry is created in the database. If success, the - data is then copied (if `old_location` was provided). Only if both - steps are successful will there be "valid" status entry in the - registry. - - Parameters - ---------- - relative_path** : str - version** : str - version_suffix** : str, optional - name** : str, optional - creation_date** : datetime, optional - description** : str, optional - execution_id** : int, optional - access_API** : str, optional - is_overwritable** : bool, optional - old_location : str, optional - Absolute location of dataset to copy into the data registry. - - If None, dataset should already be at correct relative_path within - the data registry. - copy : bool, optional - True to copy data from ``old_location`` into the data registry - (default behaviour). - False to create a symlink. 
- is_dummy : bool, optional - True for "dummy" datasets (no data is copied, for testing purposes - only) - verbose : bool, optional - Provide some additional output information - owner** : str, optional - owner_type** : str, optional - execution_name** : str, optional - execution_description** : str, optional - execution_start** : datetime, optional - execution_locale** : str, optional - execution_configuration** : str, optional - input_datasets : list, optional - List of dataset ids that were the input to this execution - input_production_datasets : list, optional - List of production dataset ids that were the input to this execution - max_config_length : int, optional - Maxiumum number of lines to read from a configuration file - - Returns - ------- - prim_key : int - The dataset ID of the new row relating to this entry (else None) - execution_id : int - The execution ID associated with the dataset - """ - - # Set max configuration file length - if max_config_length is None: - max_config_length = self._DEFAULT_MAX_CONFIG - - # Make sure the owner_type is legal - if owner_type is None: - if self._owner_type is not None: - owner_type = self._owner_type - else: - owner_type = "user" - if owner_type not in self._OWNER_TYPES: - raise ValueError(f"{owner_type} is not a valid owner_type") - - # Establish the dataset owner - if owner is None: - if self._owner is not None: - owner = self._owner - else: - owner = self._uid - if owner_type == "production": - owner = "production" - - # Checks for production datasets - if owner_type == "production": - if is_overwritable: - raise ValueError("Cannot overwrite production entries") - if version_suffix is not None: - raise ValueError("Production entries can't have version suffix") - if self._schema != "production": - raise ValueError( - "Only the production schema can handle owner_type='production'" - ) - else: - if self._schema == "production": - raise ValueError( - "Only the production schema can handle owner_type='production'" - ) - - # If `name` not passed, automatically generate a name from the relative path - if name is None: - name = _name_from_relpath(relative_path) - - # Look for previous entries. 
Fail if not overwritable - dataset_table = self._get_table_metadata("dataset") - previous_dataset = self._find_entry( - relative_path=relative_path, - owner=owner, - owner_type=owner_type, - ) - - if previous_dataset is not None: - if not previous_dataset.is_overwritable: - print(f"Dataset {relative_path} exists, and is not overwritable") - return None - - # Deal with version string (non-special case) - if version not in ["major", "minor", "patch"]: - v_fields = _parse_version_string(version) - version_string = version - else: - # Generate new version fields based on previous entries - # with the same name field and same suffix (i.e., bump) - v_fields = _bump_version( - name, version, version_suffix, dataset_table, self._engine - ) - version_string = ( - f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" - ) - - # If no execution_id is supplied, create a minimal entry - if execution_id is None: - if execution_name is None: - execution_name = f"for_dataset_{name}-{version_string}" - if version_suffix: - execution_name = f"{execution_name}-{version_suffix}" - if execution_description is None: - execution_description = "Fabricated execution for dataset" - execution_id = self.execution_table.register( - execution_name, - description=execution_description, - execution_start=execution_start, - locale=execution_locale, - configuration=execution_configuration, - input_datasets=input_datasets, - input_production_datasets=input_production_datasets, - ) - - # Pull the dataset properties together - values = {"name": name, "relative_path": relative_path} - values["version_major"] = v_fields["major"] - values["version_minor"] = v_fields["minor"] - values["version_patch"] = v_fields["patch"] - values["version_string"] = version_string - if version_suffix: - values["version_suffix"] = version_suffix - if description: - values["description"] = description - if execution_id: - values["execution_id"] = execution_id - if access_API: - values["access_API"] = access_API - if access_API_configuration: - values["access_API_configuration"] = _read_configuration_file( - access_API_configuration, max_config_length - ) - values["is_overwritable"] = is_overwritable - values["is_overwritten"] = False - values["is_external_link"] = False - values["is_archived"] = False - values["register_date"] = datetime.now() - values["owner_type"] = owner_type - values["owner"] = owner - values["creator_uid"] = self._uid - values["register_root_dir"] = self._root_dir - - # We tentatively start with an "invalid" dataset in the database. This - # will be upgraded to valid if the data copying (if any) was successful. - values["status"] = 0 - - # Create a new row in the data registry database. 
- with self._engine.connect() as conn: - prim_key = add_table_row(conn, dataset_table, values, commit=False) - - if previous_dataset is not None: - # Update previous rows, setting is_overwritten to True - update_stmt = ( - update(dataset_table) - .where(dataset_table.c.dataset_id == previous_dataset.dataset_id) - .values(is_overwritten=True) - ) - conn.execute(update_stmt) - conn.commit() - - # Get dataset characteristics; copy to `root_dir` if requested - if not is_dummy: - ( - dataset_organization, - num_files, - total_size, - ds_creation_date, - ) = self._handle_data( - relative_path, old_location, owner, owner_type, verbose - ) - valid_status = 1 - else: - dataset_organization = "dummy" - num_files = 0 - total_size = 0 - ds_creation_date = None - valid_status = 0 - - # Case where use is overwriting the dateset `creation_date` - if creation_date: - ds_creation_date = creation_date - - # Copy was successful, update the entry with dataset metadata - with self._engine.connect() as conn: - update_stmt = ( - update(dataset_table) - .where(dataset_table.c.dataset_id == prim_key) - .values( - data_org=dataset_organization, - nfiles=num_files, - total_disk_space=total_size / 1024 / 1024, - creation_date=ds_creation_date, - status=set_dataset_status(values["status"], valid=True), - ) - ) - conn.execute(update_stmt) - conn.commit() - - return prim_key, execution_id - - def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): - """ - Find characteristics of dataset (i.e., is it a file or directory, how - many files and total disk space of the dataset). - - If old_location is not None, copy the dataset files and directories - into the data registry. - - Parameters - ---------- - relative_path : str - Relative path of dataset in the data registry - old_location : str - Location of data (if not already in the data registry root) - Data will be copied from this location - owner : str - Owner of the dataset - owner_type : str - Owner type of the dataset - verbose : bool - True for extra output - - Returns - ------- - dataset_organization : str - "file", "directory", or "dummy" - num_files : int - Total number of files making up dataset - total_size : float - Total disk space of dataset in bytes - ds_creation_date : datetime - When file or directory was created - """ - - # Get destination directory in data registry. - dest = _form_dataset_path( - owner_type, - owner, - relative_path, - schema=self._schema, - root_dir=self._root_dir, - ) - - # Is the data already on location, or coming from somewhere new? - if old_location: - loc = old_location - else: - loc = dest - - # Get metadata on dataset. 
- if os.path.isfile(loc): - dataset_organization = "file" - elif os.path.isdir(loc): - dataset_organization = "directory" - else: - raise FileNotFoundError(f"Dataset {loc} not found") - - if verbose: - tic = time.time() - print("Collecting metadata...", end="") - - ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) - - if dataset_organization == "directory": - num_files, total_size = get_directory_info(loc) - else: - num_files = 1 - total_size = os.path.getsize(loc) - if verbose: - print(f"took {time.time()-tic:.2f}s") - - # Copy data into data registry - if old_location: - if verbose: - tic = time.time() - print( - f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", - end="", - ) - _copy_data(dataset_organization, old_location, dest) - if verbose: - print(f"took {time.time()-tic:.2f}") - - return dataset_organization, num_files, total_size, ds_creation_date - - def _find_entry( - self, - relative_path=None, - owner=None, - owner_type=None, - dataset_id=None, - ): - """ - Find a dataset entry in the database. - - Can search by either: - 1) Just `dataset_id` - 2) A combination of `relative_path`, `owner` and `owner_type`. - - Only one dataset should ever be found. - - Parameters - ---------- - relative_path : str, optional - Relative path to dataset - owner : str, optional - Owner of the dataset - owner_type : str, optional - dataset_id : int, optional - - Returns - ------- - r : CursorResult object - Searched dataset - """ - - # Make sure we have all the relavant information - if dataset_id is None: - if (relative_path is None) or (owner is None) or (owner_type is None): - raise ValueError( - "Must pass relative_path, owner and owner_type to _find_entry" - ) - - # Search for dataset in the registry. - dataset_table = self._get_table_metadata("dataset") - stmt = select(dataset_table) - - if dataset_id is None: - stmt = stmt.where( - dataset_table.c.relative_path == relative_path, - dataset_table.c.owner == owner, - dataset_table.c.owner_type == owner_type, - ) - else: - stmt = stmt.where(dataset_table.c.dataset_id == dataset_id) - - with self._engine.connect() as conn: - result = conn.execute(stmt) - conn.commit() - - # Pull out the single result - for r in result: - return r - - # No results found - return None - - def delete(self, dataset_id): - """ - Delete an dataset entry from the DESC data registry. - - This will also remove the raw data from the root dir, but the dataset - entry remains in the registry (now with an updated `status` field). 
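
Both _handle_data() above and delete() below locate the payload through _form_dataset_path(). Based on the layout asserted in test_get_dataset_absolute_path, the resolved path looks roughly like this hypothetical re-implementation, shown for illustration only:

import os

def form_dataset_path(owner_type, owner, relative_path, schema=None, root_dir="."):
    """Assumed layout: <root_dir>[/<schema>]/<owner_type>/<owner>/<relative_path>.

    The schema level is omitted for sqlite, matching the branch in
    test_get_dataset_absolute_path.
    """
    parts = [root_dir] + ([schema] if schema else []) + [owner_type, owner, relative_path]
    return os.path.join(*parts)
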
- - Parameters - ---------- - dataset_id : int - Dataset we want to delete from the registry - """ - - # First make sure the given dataset id is in the registry - dataset_table = self._get_table_metadata(self.which_table) - previous_dataset = self._find_entry(dataset_table, dataset_id=dataset_id) - - # Check dataset exists - if previous_dataset is None: - raise ValueError(f"Dataset ID {dataset_id} does not exist") - # Check dataset is valid - if not get_dataset_status(previous_dataset.status, "valid"): - raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") - # Check dataset has not already been deleted - if get_dataset_status(previous_dataset.status, "deleted"): - raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") - - # Update the status of the dataset to deleted - with self._engine.connect() as conn: - update_stmt = ( - update(dataset_table) - .where(dataset_table.c.dataset_id == dataset_id) - .values( - status=set_dataset_status(previous_dataset.status, deleted=True), - delete_date=datetime.now(), - delete_uid=self._uid, - ) - ) - conn.execute(update_stmt) - conn.commit() - - # Delete the physical data in the root_dir - if previous_dataset.data_org != "dummy": - data_path = _form_dataset_path( - previous_dataset.owner_type, - previous_dataset.owner, - previous_dataset.relative_path, - schema=self._schema, - root_dir=self._root_dir, - ) - print(f"Deleting data {data_path}") - os.remove(data_path) - - print(f"Deleted {dataset_id} from data registry") + def __init__(self, db_connection, root_dir, owner, owner_type, execution_table): + super().__init__(db_connection, root_dir, owner, owner_type) + + self.execution_table = execution_table + self.which_table = "dataset" + + def register( + self, + relative_path, + version, + version_suffix=None, + name=None, + creation_date=None, + description=None, + execution_id=None, + access_API=None, + access_API_configuration=None, + is_overwritable=False, + old_location=None, + copy=True, + is_dummy=False, + verbose=False, + owner=None, + owner_type=None, + execution_name=None, + execution_description=None, + execution_start=None, + execution_locale=None, + execution_configuration=None, + input_datasets=[], + input_production_datasets=[], + max_config_length=None, + ): + """ + Create a new dataset entry in the DESC data registry. + + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + + First, the dataset entry is created in the database. If success, the + data is then copied (if `old_location` was provided). Only if both + steps are successful will there be "valid" status entry in the + registry. + + Parameters + ---------- + relative_path** : str + version** : str + version_suffix** : str, optional + name** : str, optional + creation_date** : datetime, optional + description** : str, optional + execution_id** : int, optional + access_API** : str, optional + is_overwritable** : bool, optional + old_location : str, optional + Absolute location of dataset to copy into the data registry. + + If None, dataset should already be at correct relative_path within + the data registry. + copy : bool, optional + True to copy data from ``old_location`` into the data registry + (default behaviour). + False to create a symlink. 
+ is_dummy : bool, optional + True for "dummy" datasets (no data is copied, for testing purposes + only) + verbose : bool, optional + Provide some additional output information + owner** : str, optional + owner_type** : str, optional + execution_name** : str, optional + execution_description** : str, optional + execution_start** : datetime, optional + execution_locale** : str, optional + execution_configuration** : str, optional + input_datasets : list, optional + List of dataset ids that were the input to this execution + input_production_datasets : list, optional + List of production dataset ids that were the input to this execution + max_config_length : int, optional + Maxiumum number of lines to read from a configuration file + + Returns + ------- + prim_key : int + The dataset ID of the new row relating to this entry (else None) + execution_id : int + The execution ID associated with the dataset + """ + + # Set max configuration file length + if max_config_length is None: + max_config_length = self._DEFAULT_MAX_CONFIG + + # Make sure the owner_type is legal + if owner_type is None: + if self._owner_type is not None: + owner_type = self._owner_type + else: + owner_type = "user" + if owner_type not in self._OWNER_TYPES: + raise ValueError(f"{owner_type} is not a valid owner_type") + + # Establish the dataset owner + if owner is None: + if self._owner is not None: + owner = self._owner + else: + owner = self._uid + if owner_type == "production": + owner = "production" + + # Checks for production datasets + if owner_type == "production": + if is_overwritable: + raise ValueError("Cannot overwrite production entries") + if version_suffix is not None: + raise ValueError("Production entries can't have version suffix") + if self._schema != "production": + raise ValueError( + "Only the production schema can handle owner_type='production'" + ) + else: + if self._schema == "production": + raise ValueError( + "Only the production schema can handle owner_type='production'" + ) + + # If `name` not passed, automatically generate a name from the relative path + if name is None: + name = _name_from_relpath(relative_path) + + # Look for previous entries. 
Fail if not overwritable + dataset_table = self._get_table_metadata("dataset") + previous_dataset_id, previous_dataset_overwritable = self._find_previous( + relative_path, + owner, + owner_type, + ) + + if previous_dataset_id is not None: + if not previous_dataset_overwritable: + print(f"Dataset {relative_path} exists, and is not overwritable") + return None + + # Deal with version string (non-special case) + if version not in ["major", "minor", "patch"]: + v_fields = _parse_version_string(version) + version_string = version + else: + # Generate new version fields based on previous entries + # with the same name field and same suffix (i.e., bump) + v_fields = _bump_version( + name, version, version_suffix, dataset_table, self._engine + ) + version_string = ( + f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}" + ) + + # If no execution_id is supplied, create a minimal entry + if execution_id is None: + if execution_name is None: + execution_name = f"for_dataset_{name}-{version_string}" + if version_suffix: + execution_name = f"{execution_name}-{version_suffix}" + if execution_description is None: + execution_description = "Fabricated execution for dataset" + execution_id = self.execution_table.register( + execution_name, + description=execution_description, + execution_start=execution_start, + locale=execution_locale, + configuration=execution_configuration, + input_datasets=input_datasets, + input_production_datasets=input_production_datasets, + ) + + # Pull the dataset properties together + values = {"name": name, "relative_path": relative_path} + values["version_major"] = v_fields["major"] + values["version_minor"] = v_fields["minor"] + values["version_patch"] = v_fields["patch"] + values["version_string"] = version_string + if version_suffix: + values["version_suffix"] = version_suffix + if description: + values["description"] = description + if execution_id: + values["execution_id"] = execution_id + if access_API: + values["access_API"] = access_API + if access_API_configuration: + values["access_API_configuration"] = _read_configuration_file( + access_API_configuration, max_config_length + ) + values["is_overwritable"] = is_overwritable + values["is_overwritten"] = False + values["is_external_link"] = False + values["is_archived"] = False + values["register_date"] = datetime.now() + values["owner_type"] = owner_type + values["owner"] = owner + values["creator_uid"] = self._uid + values["register_root_dir"] = self._root_dir + + # We tentatively start with an "invalid" dataset in the database. This + # will be upgraded to valid if the data copying (if any) was successful. + values["status"] = 0 + + # Create a new row in the data registry database. 
+ with self._engine.connect() as conn: + prim_key = add_table_row(conn, dataset_table, values, commit=False) + + if previous_dataset_id is not None: + # Update previous rows, setting is_overwritten to True + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == previous_dataset_id) + .values(is_overwritten=True) + ) + conn.execute(update_stmt) + conn.commit() + + # Get dataset characteristics; copy to `root_dir` if requested + if not is_dummy: + ( + dataset_organization, + num_files, + total_size, + ds_creation_date, + ) = self._handle_data( + relative_path, old_location, owner, owner_type, verbose + ) + valid_status = 1 + else: + dataset_organization = "dummy" + num_files = 0 + total_size = 0 + ds_creation_date = None + valid_status = 0 + + # Case where use is overwriting the dateset `creation_date` + if creation_date: + ds_creation_date = creation_date + + # Copy was successful, update the entry with dataset metadata + with self._engine.connect() as conn: + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == prim_key) + .values( + data_org=dataset_organization, + nfiles=num_files, + total_disk_space=total_size / 1024 / 1024, + creation_date=ds_creation_date, + status=set_dataset_status(values["status"], valid=True), + ) + ) + conn.execute(update_stmt) + conn.commit() + + return prim_key, execution_id + + def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): + """ + Find characteristics of dataset (i.e., is it a file or directory, how + many files and total disk space of the dataset). + + If old_location is not None, copy the dataset files and directories + into the data registry. + + Parameters + ---------- + relative_path : str + Relative path of dataset in the data registry + old_location : str + Location of data (if not already in the data registry root) + Data will be copied from this location + owner : str + Owner of the dataset + owner_type : str + Owner type of the dataset + verbose : bool + True for extra output + + Returns + ------- + dataset_organization : str + "file", "directory", or "dummy" + num_files : int + Total number of files making up dataset + total_size : float + Total disk space of dataset in bytes + ds_creation_date : datetime + When file or directory was created + """ + + # Get destination directory in data registry. + dest = _form_dataset_path( + owner_type, + owner, + relative_path, + schema=self._schema, + root_dir=self._root_dir, + ) + + # Is the data already on location, or coming from somewhere new? + if old_location: + loc = old_location + else: + loc = dest + + # Get metadata on dataset. 
+ if os.path.isfile(loc): + dataset_organization = "file" + elif os.path.isdir(loc): + dataset_organization = "directory" + else: + raise FileNotFoundError(f"Dataset {loc} not found") + + if verbose: + tic = time.time() + print("Collecting metadata...", end="") + + ds_creation_date = datetime.fromtimestamp(os.path.getctime(loc)) + + if dataset_organization == "directory": + num_files, total_size = get_directory_info(loc) + else: + num_files = 1 + total_size = os.path.getsize(loc) + if verbose: + print(f"took {time.time()-tic:.2f}s") + + # Copy data into data registry + if old_location: + if verbose: + tic = time.time() + print( + f"Copying {num_files} files ({total_size/1024/1024:.2f} Mb)...", + end="", + ) + _copy_data(dataset_organization, old_location, dest) + if verbose: + print(f"took {time.time()-tic:.2f}") + + return dataset_organization, num_files, total_size, ds_creation_date + + def _find_previous(self, relative_path, owner, owner_type): + """ + Find a dataset(s) based on their combination of `relative_path`, + `owner`, `owner_type`. + + Looking to see if an entry exists, so we can check if it is + overwritable. If multiple datasets are found, only the latest (i.e., + that with `is_overwritten=False`) is of interest. + + Parameters + ---------- + relative_path : str + Relative path to dataset + owner : str + Owner of the dataset + owner_type : str + Owner type of the dataset + + Returns + ------- + dataset_id : bool + Dataset ID of dataset with the path combination + dataset_is_overwritable : bool + True if found dataset can be overwritten + """ + + # Search for dataset in the registry. + dataset_table = self._get_table_metadata("dataset") + stmt = select(dataset_table) + + stmt = stmt.where( + dataset_table.c.relative_path == relative_path, + dataset_table.c.owner == owner, + dataset_table.c.owner_type == owner_type, + ) + + with self._engine.connect() as conn: + result = conn.execute(stmt) + conn.commit() + + # Pull out the single result + dataset_id = None + dataset_is_overwritable = None + for r in result: + if not r.is_overwritten: + dataset_is_overwritable = r.is_overwritable + dataset_id = r.dataset_id + break + + return dataset_id, dataset_is_overwritable + + def delete(self, dataset_id): + """ + Delete an dataset entry from the DESC data registry. + + This will also remove the raw data from the root dir, but the dataset + entry remains in the registry (now with an updated `status` field). 
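
A note on the `status` checks used in `delete` and in `register`: the column is read and written through `get_dataset_status` and `set_dataset_status`, which are defined elsewhere in the package. If one assumes a plain bit mask, which the `valid=True` / `deleted=True` keyword usage suggests but this patch does not show, the helpers behave roughly like the sketch below (bit positions and names are assumptions):

_STATUS_BIT_SKETCH = {"valid": 0, "deleted": 1}  # assumed bit layout

def set_dataset_status_sketch(status, valid=None, deleted=None):
    """Return `status` with the requested flags switched on (True) or off (False)."""
    for name, flag in (("valid", valid), ("deleted", deleted)):
        if flag is None:
            continue
        bit = 1 << _STATUS_BIT_SKETCH[name]
        status = (status | bit) if flag else (status & ~bit)
    return status

def get_dataset_status_sketch(status, which):
    """True if the `which` flag is set in `status`."""
    return bool(status & (1 << _STATUS_BIT_SKETCH[which]))

# get_dataset_status_sketch(set_dataset_status_sketch(0, valid=True), "valid") -> True
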
+ + Parameters + ---------- + dataset_id : int + Dataset we want to delete from the registry + """ + + # First make sure the given dataset id is in the registry + dataset_table = self._get_table_metadata(self.which_table) + previous_dataset = self.find_entry(dataset_id) + + # Check dataset exists + if previous_dataset is None: + raise ValueError(f"Dataset ID {dataset_id} does not exist") + # Check dataset is valid + if not get_dataset_status(previous_dataset.status, "valid"): + raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") + # Check dataset has not already been deleted + if get_dataset_status(previous_dataset.status, "deleted"): + raise ValueError(f"Dataset ID {dataset_id} does not have a valid status") + + # Update the status of the dataset to deleted + with self._engine.connect() as conn: + update_stmt = ( + update(dataset_table) + .where(dataset_table.c.dataset_id == dataset_id) + .values( + status=set_dataset_status(previous_dataset.status, deleted=True), + delete_date=datetime.now(), + delete_uid=self._uid, + ) + ) + conn.execute(update_stmt) + conn.commit() + + # Delete the physical data in the root_dir + if previous_dataset.data_org != "dummy": + data_path = _form_dataset_path( + previous_dataset.owner_type, + previous_dataset.owner, + previous_dataset.relative_path, + schema=self._schema, + root_dir=self._root_dir, + ) + print(f"Deleting data {data_path}") + os.remove(data_path) + + print(f"Deleted {dataset_id} from data registry") diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index da60e2d2..b08d9a1c 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -116,7 +116,6 @@ def _insert_execution_entry( The execution ID for this new entry """ - new_id = datareg.Registrar.execution.register( name, description=description, @@ -435,9 +434,11 @@ def test_copy_data(dummy_file, data_org): "data_org,data_path,v_str,overwritable", [ ("file", "file1.txt", "0.0.1", True), - ("file", "file1.txt", "0.0.2", False), + ("file", "file1.txt", "0.0.2", True), + ("file", "file1.txt", "0.0.3", False), ("directory", "dummy_dir", "0.0.1", True), - ("directory", "dummy_dir", "0.0.2", False), + ("directory", "dummy_dir", "0.0.2", True), + ("directory", "dummy_dir", "0.0.3", False), ], ) def test_on_location_data(dummy_file, data_org, data_path, v_str, overwritable): @@ -445,9 +446,9 @@ def test_on_location_data(dummy_file, data_org, data_path, v_str, overwritable): Test ingesting real data into the registry (already on location). Also tests overwriting datasets. - Does twice for each file, the first is a normal entry with - `is_overwritable=True`. The second tests overwriting the previous data with - a new version. + Does three times for each file, the first is a normal entry with + `is_overwritable=True`. The second and third tests overwriting the previous + data with a new version. 
""" # Establish connection to database @@ -492,14 +493,22 @@ def test_on_location_data(dummy_file, data_org, data_path, v_str, overwritable): else: assert getattr(r, "dataset.is_overwritable") == True assert getattr(r, "dataset.is_overwritten") == True - else: - if num_results == 1: - assert getattr(r, "dataset.is_overwritable") == False + elif getattr(r, "version_string") == "0.0.2": + assert num_results >= 2 + if num_results == 2: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == False + elif num_results == 3: + assert getattr(r, "dataset.is_overwritable") == True assert getattr(r, "dataset.is_overwritten") == True - else: + elif getattr(r, "version_string") == "0.0.3": + assert num_results >= 3 + if num_results == 3: assert getattr(r, "dataset.is_overwritable") == False assert getattr(r, "dataset.is_overwritten") == False - assert i < 2 + else: + assert getattr(r, "dataset.is_overwritable") == True + assert getattr(r, "dataset.is_overwritten") == True def test_dataset_alias(dummy_file): @@ -850,6 +859,7 @@ def test_get_dataset_absolute_path(dummy_file): str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath ) + @pytest.mark.parametrize( "is_dummy,dataset_name", [ From ad408fd6f8ae31dbce141e8829225017baf9bcec Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 16 Feb 2024 20:30:05 +0100 Subject: [PATCH 17/19] Add proper function for removing directories --- src/dataregistry/registrar/dataset.py | 6 +++++- tests/end_to_end_tests/test_end_to_end.py | 25 +++++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 1b12343c..8ce3fbb7 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -1,6 +1,7 @@ import os import time from datetime import datetime +import shutil from dataregistry.db_basic import add_table_row from sqlalchemy import select, update @@ -471,6 +472,9 @@ def delete(self, dataset_id): root_dir=self._root_dir, ) print(f"Deleting data {data_path}") - os.remove(data_path) + if os.path.isfile(data_path): + os.remove(data_path) + else: + shutil.rmtree(data_path) print(f"Deleted {dataset_id} from data registry") diff --git a/tests/end_to_end_tests/test_end_to_end.py b/tests/end_to_end_tests/test_end_to_end.py index b08d9a1c..e6fb0c8f 100644 --- a/tests/end_to_end_tests/test_end_to_end.py +++ b/tests/end_to_end_tests/test_end_to_end.py @@ -13,7 +13,16 @@ @pytest.fixture def dummy_file(tmp_path): """ - Create some dummy (temporary) files and directories + Create some dummy (temporary) files and directories: + + | - + | - + | - file1.txt + | - file2.txt + | - + | - file2.txt + | + | - Parameters ---------- @@ -865,6 +874,7 @@ def test_get_dataset_absolute_path(dummy_file): [ (True, "dummy_dataset_to_delete"), (False, "real_dataset_to_delete"), + (False, "real_directory_to_delete"), ], ) def test_delete_entry(dummy_file, is_dummy, dataset_name): @@ -886,8 +896,12 @@ def test_delete_entry(dummy_file, is_dummy, dataset_name): if is_dummy: data_path = None else: - data_path = str(tmp_src_dir / "file2.txt") - assert os.path.isfile(data_path) + if dataset_name == "real_dataset_to_delete": + data_path = str(tmp_src_dir / "file2.txt") + assert os.path.isfile(data_path) + else: + data_path = str(tmp_src_dir / "directory1") + assert os.path.isdir(data_path) # Add entry d_id = _insert_dataset_entry( @@ -933,7 +947,10 @@ def test_delete_entry(dummy_file, is_dummy, 
dataset_name): schema=SCHEMA_VERSION, root_dir=str(tmp_root_dir), ) - assert not os.path.isfile(data_path) + if dataset_name == "real_dataset_to_delete": + assert not os.path.isfile(data_path) + else: + assert not os.path.isdir(data_path) # Make sure we can not delete an already deleted entry. with pytest.raises(ValueError, match="not have a valid status"): From 21cad22bb27aa45ee186c508d72651f2879e4b4e Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 17 Feb 2024 12:49:11 +0100 Subject: [PATCH 18/19] Revert find_previous behaviour --- src/dataregistry/registrar/dataset.py | 49 +++++++++++++-------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 8ce3fbb7..bad8d30b 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -156,16 +156,11 @@ def register( # Look for previous entries. Fail if not overwritable dataset_table = self._get_table_metadata("dataset") - previous_dataset_id, previous_dataset_overwritable = self._find_previous( - relative_path, - owner, - owner_type, - ) + previous = self._find_previous(relative_path, owner, owner_type) - if previous_dataset_id is not None: - if not previous_dataset_overwritable: - print(f"Dataset {relative_path} exists, and is not overwritable") - return None + if previous is None: + print(f"Dataset {relative_path} exists, and is not overwritable") + return None, None # Deal with version string (non-special case) if version not in ["major", "minor", "patch"]: @@ -235,11 +230,11 @@ def register( with self._engine.connect() as conn: prim_key = add_table_row(conn, dataset_table, values, commit=False) - if previous_dataset_id is not None: + if len(previous) > 0: # Update previous rows, setting is_overwritten to True update_stmt = ( update(dataset_table) - .where(dataset_table.c.dataset_id == previous_dataset_id) + .where(dataset_table.c.dataset_id.in_(previous)) .values(is_overwritten=True) ) conn.execute(update_stmt) @@ -372,12 +367,15 @@ def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): def _find_previous(self, relative_path, owner, owner_type): """ - Find a dataset(s) based on their combination of `relative_path`, - `owner`, `owner_type`. + Find each dataset with combination of `relative_path`, `owner`, + `owner_type`. + + We want to know, of those datasets, which are overwritable but have not + yet been marked as overwritten. - Looking to see if an entry exists, so we can check if it is - overwritable. If multiple datasets are found, only the latest (i.e., - that with `is_overwritten=False`) is of interest. + If any dataset with the same path has `is_overwritable=False`, the + routine returns None, indicating the dataset is not allowed to be + overwritten. Parameters ---------- @@ -390,10 +388,9 @@ def _find_previous(self, relative_path, owner, owner_type): Returns ------- - dataset_id : bool - Dataset ID of dataset with the path combination - dataset_is_overwritable : bool - True if found dataset can be overwritten + dataset_id_list : list + List of dataset IDs that have the desired path combination that are + overwritable, but have not already previously been overwritten. """ # Search for dataset in the registry. 
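
The rule restored by this commit is easier to see stripped of the SQLAlchemy plumbing: if any existing dataset at the same `relative_path`/`owner`/`owner_type` is not overwritable, registration is refused; otherwise every previous entry not yet marked `is_overwritten` gets flagged when the new row is written. A database-free sketch (row tuples and function name are illustrative):

def overwritable_ids_sketch(rows):
    """rows: iterable of (dataset_id, is_overwritable, is_overwritten) tuples.

    Returns None if any matching dataset forbids overwriting, otherwise the
    IDs that the new registration should flag as is_overwritten=True.
    """
    ids = []
    for dataset_id, is_overwritable, is_overwritten in rows:
        if not is_overwritable:
            return None
        if not is_overwritten:
            ids.append(dataset_id)
    return ids

# overwritable_ids_sketch([(1, True, True), (2, True, False)])  -> [2]
# overwritable_ids_sketch([(3, False, False)])                  -> None (refuse to register)
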
@@ -411,15 +408,15 @@ def _find_previous(self, relative_path, owner, owner_type): conn.commit() # Pull out the single result - dataset_id = None - dataset_is_overwritable = None + dataset_id_list = [] for r in result: + if not r.is_overwritable: + return None + if not r.is_overwritten: - dataset_is_overwritable = r.is_overwritable - dataset_id = r.dataset_id - break + dataset_id_list.append(r.dataset_id) - return dataset_id, dataset_is_overwritable + return dataset_id_list def delete(self, dataset_id): """ From 90646145c00f1221321b5e48e7174c906e23984c Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Thu, 7 Mar 2024 12:09:28 +0100 Subject: [PATCH 19/19] Address review comments --- src/dataregistry/registrar/dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index bad8d30b..65c8f79c 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -395,7 +395,11 @@ def _find_previous(self, relative_path, owner, owner_type): # Search for dataset in the registry. dataset_table = self._get_table_metadata("dataset") - stmt = select(dataset_table) + stmt = select( + dataset_table.c.dataset_id, + dataset_table.c.is_overwritable, + dataset_table.c.is_overwritten, + ) stmt = stmt.where( dataset_table.c.relative_path == relative_path,