diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 88a0f860..c9e893e5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,6 +74,7 @@ jobs:
     env:
       DATAREG_CONFIG: "${{ github.workspace }}/config.txt"
+      DATAREG_BACKEND: "postgres"
 
     # Service containers to run with `runner-job`
     services:
@@ -150,6 +151,7 @@ jobs:
     env:
       DATAREG_CONFIG: "${{ github.workspace }}/config.txt"
+      DATAREG_BACKEND: "sqlite"
 
     # Our strategy lists the OS and Python versions we want to test on.
     strategy:
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 2c7e4af8..ed2295b7 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -10,7 +10,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           python -m pip install .
-          pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram
+          pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram sphinxcontrib.datatemplates
       - name: Sphinx build
         run: |
           sphinx-build docs/source _build
diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css
new file mode 100644
index 00000000..7d2e89e6
--- /dev/null
+++ b/docs/source/_static/css/custom.css
@@ -0,0 +1,3 @@
+.tight-table td {
+    white-space: normal !important;
+}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8b2fa02e..6bbbb460 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -5,7 +5,8 @@
     "sphinx_rtd_theme",
     "sphinx.ext.autodoc",
     'sphinx.ext.napoleon',
-    'sphinxcontrib.autoprogram'
+    'sphinxcontrib.autoprogram',
+    'sphinxcontrib.datatemplates'
 ]
 
 project = 'DESC data management'
@@ -36,3 +37,9 @@
 html_logo = '_static/DREGS_logo_v2.png'
 
 autoclass_content = 'both'
+
+templates_path = ['templates']
+
+html_css_files = [
+    'css/custom.css',
+]
diff --git a/docs/source/reference_schema.rst b/docs/source/reference_schema.rst
index 9494ac52..66da4bc2 100644
--- a/docs/source/reference_schema.rst
+++ b/docs/source/reference_schema.rst
@@ -7,248 +7,5 @@ database (e.g., the default and production schemas) follows the same structure.
 
 .. image:: _static/schema_plot.png
    :alt: Image missing
 
-The dataset table
------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``dataset_id``
-     - Unique identifier for dataset
-     - int
-   * - ``name``
-     - User given name for dataset
-     - str
-   * - ``relative_path``
-     - Relative path storing the data, relative to ``<root_dir>``
-     - str
-   * - ``version_major``
-     - Major version in semantic string (i.e., X.x.x)
-     - int
-   * - ``version_minor``
-     - Minor version in semantic string (i.e., x.X.x)
-     - int
-   * - ``version_patch``
-     - Patch version in semantic string (i.e., x.x.X)
-     - int
-   * - ``version_suffix``
-     - Optional version suffix
-     - str
-   * - ``dataset_creation_date``
-     - Dataset creation date
-     - datetime
-   * - ``is_archived``
-     - True if the data is archived, i.e., the data is no longer within ``<root_dir>``
-     - bool
-   * - ``is_external_link``
-     - ???
-     - bool
-   * - ``is_overwritten``
-     - True if the original data for this dataset has been overwritten at some point. This would have required that ``is_overwritable`` was set to ``true`` on the original dataset
-     - bool
-   * - ``is_valid``
-     - ???
-     - bool
-   * - ``register_date``
-     - Date the dataset was registered
-     - datetime
-   * - ``creator_uid``
-     - `uid` (user id) of the person that registered the dataset
-     - str
-   * - ``access_API``
-     - Describes the software that can read the dataset (e.g., "gcr-catalogs", "skyCatalogs")
-     - str
-   * - ``execution_id``
-     - ID of execution this dataset belongs to
-     - int
-   * - ``description``
-     - User provided description of the dataset
-     - str
-   * - ``owner_type``
-     - Dataset owner type, can be "user", "group", "project" or "production".
-     - str
-   * - ``owner``
-     - Owner of the dataset
-     - str
-   * - ``data_org``
-     - Dataset organisation ("file" or "directory")
-     - str
-   * - ``nfiles``
-     - How many files are in the dataset
-     - int
-   * - ``total_disk_space``
-     - Total disk space used by the dataset
-     - float
-
-The dataset_alias table
------------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``dataset_alias_id``
-     - Unique identifier for alias
-     - int
-   * - ``name``
-     - User given alias name
-     - str
-   * - ``dataset_id``
-     - ID of dataset this is an alias for
-     - int
-   * - ``supersede_date``
-     - If a new entry has been added to the table with the same alias name (but
-       different dataset_id), the old entry will be superseded. ``supersede_date``
-       in the old entry tracks when this happened. If the entry has not been
-       superseded, ``supersede_date`` will be None
-     - datetime
-   * - ``register_date``
-     - Date the dataset was registered
-     - datetime
-   * - ``creator_uid``
-     - `uid` (user id) of the person that registered the dataset
-     - str
-
-The dependency table
---------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``dependency_id``
-     - Unique identifier for dependency
-     - int
-   * - ``execution_id``
-     - Execution this dependency is linked to
-     - int
-   * - ``input_id``
-     - Dataset ID of the dependent dataset
-     - int
-   * - ``register_date``
-     - Date the dependency was registered
-     - datetime
-
-The execution table
--------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``execution_id``
-     - Unique identifier for execution
-     - int
-   * - ``description``
-     - User given description of execution
-     - str
-   * - ``name``
-     - User given execution name
-     - str
-   * - ``register_date``
-     - Date the execution was registered
-     - datetime
-   * - ``execution_start``
-     - Date the execution started
-     - datetime
-   * - ``locale``
-     - Locale of execution (e.g., NERSC)
-     - str
-   * - ``configuration``
-     - Path to configuration file of execution
-     - str
-   * - ``creator_uid``
-     - `uid` (user id) of the person that registered the dataset
-     - str
-
-The execution_alias table
--------------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``execution_alias_id``
-     - Unique identifier for execution alias
-     - int
-   * - ``execution_id``
-     - Execution this alias is linked to
-     - int
-   * - ``alias``
-     - User given execution alias name
-     - str
-   * - ``register_date``
-     - Date the execution was registered
-     - datetime
-   * - ``supersede_date``
-     - If a new entry has been added to the table with the same alias name (but
-       different dataset_id), the old entry will be superseded. ``supersede_date``
-       in the old entry tracks when this happened. If the entry has not been
-       superseded, ``supersede_date`` will be None
-     - datetime
-   * - ``creator_uid``
-     - `uid` (user id) of the person that registered the dataset
-     - str
-
-The provenance table
---------------------
-
-.. list-table::
-   :header-rows: 1
-
-   * - row
-     - description
-     - type
-   * - ``provenance_id``
-     - Unique identifier for provenance
-     - int
-   * - ``code_version_major``
-     - Major version of code when this schema was created
-     - int
-   * - ``code_version_minor``
-     - Minor version of code when this schema was created
-     - int
-   * - ``code_version_patch``
-     - Patch version of code when this schema was created
-     - int
-   * - ``code_version_suffix``
-     - Version suffix of code when this schema was created
-     - str
-   * - ``db_version_major``
-     - Major version of database
-     - int
-   * - ``db_version_minor``
-     - Minor version of database
-     - int
-   * - ``db_version_patch``
-     - Patch version of database
-     - int
-   * - ``git_hash``
-     - Git commit hash when this schema was created
-     - str
-   * - ``repo_is_clean``
-     - Was repository clean when this schema was created
-     - bool
-   * - ``update_method``
-     - "CREATE", "MODIFY" or "MIGRATE"
-     - str
-   * - ``schema_enabled_date``
-     - When was the schema enabled
-     - datetime
-   * - ``creator_uid``
-     - `uid` (user id) of the person that registered the schema
-     - str
-   * - ``comment``
-     - Any comment
-     - str
+.. datatemplate:yaml:: ../../src/dataregistry/schema/schema.yaml
+   :template: schema_table.tmpl
diff --git a/docs/source/templates/schema_table.tmpl b/docs/source/templates/schema_table.tmpl
new file mode 100644
index 00000000..7abbbce6
--- /dev/null
+++ b/docs/source/templates/schema_table.tmpl
@@ -0,0 +1,22 @@
+.. -*- mode: rst -*-
+
+{% for table in ['execution','provenance','execution_alias','dataset','dependency','dataset_alias'] %}
+
+The {{table}} table
+----------------------------------------
+
+.. list-table::
+   :header-rows: 1
+   :class: tight-table
+
+   * - column
+     - description
+     - type
+
+{% for column in data[table] %}
+   * - {{column}}
+{% for field in ['description', 'type'] %}
+     - {{data[table][column][field]}}
+{% endfor %}
+{% endfor %}
+{% endfor %}
diff --git a/pyproject.toml b/pyproject.toml
index c15b3459..8cc09883 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,4 +38,4 @@ where = ["src"]
 dregs = "cli.cli:main"
 
 [tool.setuptools.package-data]
-"dataregistry" = ["site_config/site_rootdir.yaml"]
+"dataregistry" = ["site_config/site_rootdir.yaml", "schema/schema.yaml"]
diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py
index c63c1cd1..4c21d302 100644
--- a/scripts/create_registry_db.py
+++ b/scripts/create_registry_db.py
@@ -2,11 +2,21 @@
 import sys
 import argparse
 from datetime import datetime
-from sqlalchemy import Column, Integer, String, DateTime, Boolean, Index, Float
+from sqlalchemy import (
+    Column,
+    ColumnDefault,
+    Integer,
+    String,
+    DateTime,
+    Boolean,
+    Index,
+    Float,
+)
 from sqlalchemy import ForeignKey, UniqueConstraint, text
 from sqlalchemy.orm import relationship, DeclarativeBase
 from dataregistry.db_basic import DbConnection, SCHEMA_VERSION
-from dataregistry.db_basic import add_table_row, _insert_provenance
+from dataregistry.db_basic import _insert_provenance
+from dataregistry.schema import load_schema
 
 """
 A script to create the default dataregistry schema and the production schema.
@@ -20,14 +30,96 @@
 - "provenance" : Contains information about the database/schema
 """
 
+# Conversion from string types in `schema.yaml` to SQLAlchemy
+_TYPE_TRANSLATE = {
+    "String": String,
+    "Integer": Integer,
+    "DateTime": DateTime,
+    "StringShort": String(20),
+    "StringLong": String(250),
+    "Boolean": Boolean,
+    "Float": Float,
+}
+
+# Load the schema from the `schema.yaml` file
+schema_yaml = load_schema()
+
+
+def _get_column_definitions(schema, table):
+    """
+    Build the SQLAlchemy `Column` list for this table from the information in
+    the `schema.yaml` file.
+
+    Parameters
+    ----------
+    schema : str
+        Schema the table belongs to
+    table : str
+        Table name in `schema.yaml`
+
+    Returns
+    -------
+    return_dict : dict
+        SQLAlchemy Column entries for this table
+    """
+
+    return_dict = {}
+    for column in schema_yaml[table].keys():
+        # Special case where column has a foreign key
+        if schema_yaml[table][column]["foreign_key"]:
+            # Resolve "self" via a local variable rather than mutating
+            # `schema_yaml`, so a later call for a different schema still
+            # sees the "self" placeholder
+            fk_schema = schema_yaml[table][column]["foreign_key_schema"]
+            if fk_schema == "self":
+                fk_schema = schema
+
+            return_dict[column] = Column(
+                column,
+                _TYPE_TRANSLATE[schema_yaml[table][column]["type"]],
+                ForeignKey(
+                    _get_ForeignKey_str(
+                        fk_schema,
+                        schema_yaml[table][column]["foreign_key_table"],
+                        schema_yaml[table][column]["foreign_key_column"],
+                    )
+                ),
+                primary_key=schema_yaml[table][column]["primary_key"],
+                nullable=schema_yaml[table][column]["nullable"],
+            )
+
+        # Normal case
+        else:
+            return_dict[column] = Column(
+                column,
+                _TYPE_TRANSLATE[schema_yaml[table][column]["type"]],
+                primary_key=schema_yaml[table][column]["primary_key"],
+                nullable=schema_yaml[table][column]["nullable"],
+            )
+
+    return return_dict
+
+
 class Base(DeclarativeBase):
     pass
 
 
-def _get_ForeignKey_str(schema, table, row):
+
+def _get_ForeignKey_str(schema, table, column):
+    """
+    Get the string reference ("<schema>.<table>.<column>") that a foreign key
+    will point to.
+
+    The schema address will only be included for postgres backends.
+
+    Parameters
+    ----------
+    schema : str
+    table : str
+    column : str
+
+    Returns
+    -------
+    str
+    """
+
     if schema is None:
-        return f"{table}.{row}"
+        return f"{table}.{column}"
     else:
-        return f"{schema}.{table}.{row}"
+        return f"{schema}.{table}.{column}"
 
 
 def _Provenance(schema):
@@ -35,30 +127,13 @@
 
     class_name = f"{schema}_provenance"
 
-    # Rows
-    rows = {
-        "provenance_id": Column("provenance_id", Integer, primary_key=True),
-        "code_version_major": Column("code_version_major", Integer, nullable=False),
-        "code_version_minor": Column("code_version_minor", Integer, nullable=False),
-        "code_version_patch": Column("code_version_patch", Integer, nullable=False),
-        "code_version_suffix": Column("code_version_suffix", String),
-        "db_version_major": Column("db_version_major", Integer, nullable=False),
-        "db_version_minor": Column("db_version_minor", Integer, nullable=False),
-        "db_version_patch": Column("db_version_patch", Integer, nullable=False),
-        "git_hash": Column("git_hash", String, nullable=True),
-        "repo_is_clean": Column("repo_is_clean", Boolean, nullable=True),
-        # update method is always "CREATE" for this script.
-        # Alternative could be "MODIFY" or "MIGRATE"
-        "update_method": Column("update_method", String(10), nullable=False),
-        "schema_enabled_date": Column("schema_enabled_date", DateTime, nullable=False),
-        "creator_uid": Column("creator_uid", String(20), nullable=False),
-        "comment": Column("comment", String(250)),
-    }
+    # Load columns from `schema.yaml` file
+    columns = _get_column_definitions(schema, "provenance")
 
     # Table metadata
     meta = {"__tablename__": "provenance", "__table_args__": {"schema": schema}}
 
-    Model = type(class_name, (Base,), {**rows, **meta})
+    Model = type(class_name, (Base,), {**columns, **meta})
 
     return Model
 
@@ -67,24 +142,13 @@ def _Execution(schema):
 
     class_name = f"{schema}_execution"
 
-    # Rows
-    rows = {
-        "execution_id": Column("execution_id", Integer, primary_key=True),
-        "description": Column("description", String),
-        "register_date": Column("register_date", DateTime, nullable=False),
-        "execution_start": Column("execution_start", DateTime),
-        # name is meant to identify the code executed. E.g., could be pipeline name
-        "name": Column("name", String),
-        # locale is, e.g. site where code was run
-        "locale": Column("locale", String),
-        "configuration": Column("configuration", String),
-        "creator_uid": Column("creator_uid", String(20), nullable=False),
-    }
+    # Load columns from `schema.yaml` file
+    columns = _get_column_definitions(schema, "execution")
 
     # Table metadata
     meta = {"__tablename__": "execution", "__table_args__": {"schema": schema}}
 
-    Model = type(class_name, (Base,), {**rows, **meta})
+    Model = type(class_name, (Base,), {**columns, **meta})
 
     return Model
 
@@ -93,18 +157,8 @@ def _ExecutionAlias(schema):
 
     class_name = f"{schema}_execution_alias"
 
-    # Rows
-    rows = {
-        "execution_alias_id": Column("execution_alias_id", Integer, primary_key=True),
-        "alias": Column(String, nullable=False),
-        "execution_id": Column(
-            Integer,
-            ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")),
-        ),
-        "supersede_date": Column(DateTime, default=None),
-        "register_date": Column(DateTime, nullable=False),
-        "creator_uid": Column(String(20), nullable=False),
-    }
+    # Load columns from `schema.yaml` file
+    columns = _get_column_definitions(schema, "execution_alias")
 
     # Table metadata
     meta = {
@@ -115,7 +169,7 @@
         ),
     }
 
-    Model = type(class_name, (Base,), {**rows, **meta})
+    Model = type(class_name, (Base,), {**columns, **meta})
 
     return Model
 
@@ -124,17 +178,8 @@ def _DatasetAlias(schema):
 
     class_name = f"{schema}_dataset_alias"
 
-    # Rows
-    rows = {
-        "dataset_alias_id": Column(Integer, primary_key=True),
-        "alias": Column(String, nullable=False),
-        "dataset_id": Column(
-            Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id"))
-        ),
-        "supersede_date": Column(DateTime, default=None),
-        "register_date": Column(DateTime, nullable=False),
-        "creator_uid": Column(String(20), nullable=False),
-    }
+    # Load columns from `schema.yaml` file
+    columns = _get_column_definitions(schema, "dataset_alias")
 
     # Table metadata
     meta = {
@@ -145,7 +190,7 @@
         ),
     }
 
-    Model = type(class_name, (Base,), {**rows, **meta})
+    Model = type(class_name, (Base,), {**columns, **meta})
 
     return Model
 
@@ -154,49 +199,8 @@ def _Dataset(schema):
 
     class_name = f"{schema}_dataset"
 
-    # Rows
-    rows = {
-        "dataset_id": Column(Integer, primary_key=True),
-        "name": Column(String, nullable=False),
-        "relative_path": Column(String, nullable=False),
-        "version_major": Column(Integer, nullable=False),
-        "version_minor": Column(Integer, nullable=False),
"version_patch": Column(Integer, nullable=False), - "version_string": Column(String, nullable=False), - "version_suffix": Column(String), - "dataset_creation_date": Column(DateTime), - "is_archived": Column(Boolean, default=False), - "is_external_link": Column(Boolean, default=False), - "is_overwritable": Column(Boolean, default=False), - "is_overwritten": Column(Boolean, default=False), - "is_valid": Column(Boolean, default=True), # False if, e.g., copy failed - # The following are boilerplate, included in all or most tables - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - # Make access_API a string for now, but it could be an enumeration or - # a foreign key into another table. Possible values for the column - # might include "gcr-catalogs", "skyCatalogs" - "access_API": Column("access_API", String(20)), - # Optional configuration file associated with access API - "access_API_configuration": Column("configuration", String), - # A way to associate a dataset with a program execution or "run" - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "description": Column(String), - "owner_type": Column(String, nullable=False), - # If ownership_type is 'production', then owner is always 'production' - # If ownership_type is 'group', owner will be a group name - # If ownership_type is 'user', owner will be a user name - "owner": Column(String, nullable=False), - # To store metadata about the dataset. - "data_org": Column("data_org", String, nullable=False), - "nfiles": Column("nfiles", Integer, nullable=False), - "total_disk_space": Column("total_disk_space", Float, nullable=False), - # What `root_dir` was the data originially ingested into - "register_root_dir": Column(String, nullable=False), - } + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dataset") # Table metadata meta = { @@ -210,7 +214,7 @@ def _Dataset(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -219,30 +223,17 @@ def _Dependency(schema, has_production): class_name = f"{schema}_dependency" - # Rows - rows = { - "dependency_id": Column(Integer, primary_key=True), - "register_date": Column(DateTime, nullable=False), - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "input_id": Column( - Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id")) - ), - } + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dependency") + + # Remove link to production schema. + if not has_production: + del columns["input_production_id"] - # Add link to production schema. 
-    if has_production:
-        rows["input_production_id"] = Column(
-            Integer,
-            ForeignKey(_get_ForeignKey_str("production", "dataset", "dataset_id")),
-        )
 
     # Table metadata
     meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}}
 
-    Model = type(class_name, (Base,), {**rows, **meta})
+    Model = type(class_name, (Base,), {**columns, **meta})
 
     return Model
 
@@ -290,7 +281,7 @@
         stmt = f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}"
         conn.execute(text(stmt))
         conn.commit()
-    
+
 # Grant reg_reader access
 acct = "reg_reader"
 for SCHEMA in SCHEMA_LIST:
diff --git a/src/cli/cli.py b/src/cli/cli.py
index ed07b67d..a60ecc9d 100644
--- a/src/cli/cli.py
+++ b/src/cli/cli.py
@@ -4,6 +4,7 @@
 from dataregistry.db_basic import SCHEMA_VERSION
 from .register import register_dataset
 from .query import dregs_ls
+from dataregistry.schema import load_schema
 
 # ---------------------
 # The data registry CLI
@@ -45,6 +46,20 @@
 # Register a dataset
 # ------------------
 
+# Load the schema information
+schema_data = load_schema()
+
+# Conversion from string types in `schema.yaml` to Python types for the CLI
+_TYPE_TRANSLATE = {
+    "String": str,
+    "Integer": int,
+    "DateTime": str,
+    "StringShort": str,
+    "StringLong": str,
+    "Boolean": bool,
+    "Float": float,
+}
+
 # Register a new database entry.
 arg_register = subparsers.add_parser(
     "register", help="Register a new entry to the database"
 )
@@ -57,6 +72,36 @@
 # Register a new dataset.
 arg_register_dataset = arg_register_sub.add_parser("dataset", help="Register a dataset")
 
+# Get some information from the `schema.yaml` file
+for column in schema_data["dataset"]:
+    extra_args = {}
+
+    # Any default?
+    if schema_data["dataset"][column]["cli_default"] is not None:
+        extra_args["default"] = schema_data["dataset"][column]["cli_default"]
+        default_str = f" (default={extra_args['default']})"
+    else:
+        default_str = ""
+
+    # Restricted to choices?
+    if schema_data["dataset"][column]["choices"] is not None:
+        extra_args["choices"] = schema_data["dataset"][column]["choices"]
+
+    # Is this a boolean flag?
+    if schema_data["dataset"][column]["type"] == "Boolean":
+        extra_args["action"] = "store_true"
+    else:
+        extra_args["type"] = _TYPE_TRANSLATE[schema_data["dataset"][column]["type"]]
+
+    # Add flag
+    if schema_data["dataset"][column]["cli_optional"]:
+        arg_register_dataset.add_argument(
+            "--" + column,
+            help=schema_data["dataset"][column]["description"] + default_str,
+            **extra_args,
+        )
+
+# Entries unique to registering the dataset using the CLI
 arg_register_dataset.add_argument(
     "relative_path",
     help=(
@@ -75,45 +120,6 @@
     ),
     type=str,
 )
-arg_register_dataset.add_argument(
-    "--version_suffix",
-    help=(
-        "Optional suffix string to place at the end of the version string."
-        "Cannot be used for production datasets."
-    ),
-    type=str,
-)
-arg_register_dataset.add_argument(
-    "--name",
-    help=(
-        "Any convenient, evocative name for the human. Note the combination of"
-        "name, version and version_suffix must be unique. If None name is generated"
-        "from the relative path."
-    ),
-    type=str,
-)
-arg_register_dataset.add_argument(
-    "--creation_date", help="Manually set creation date of dataset"
-)
-arg_register_dataset.add_argument(
-    "--description", help="Human-readable description of dataset", type=str
-)
-arg_register_dataset.add_argument(
-    "--execution_id",
-    help="Used to associate dataset with a particular execution",
-    type=int,
-)
-arg_register_dataset.add_argument(
-    "--access_API", help="Hint as to how to read the data", type=str
-)
-arg_register_dataset.add_argument(
-    "--is_overwritable",
-    help=(
-        "True if dataset may be overwritten (defaults to False). Production"
-        "datasets cannot be overwritten."
-    ),
-    action="store_true",
-)
 arg_register_dataset.add_argument(
     "--old_location",
     help=(
@@ -137,18 +143,6 @@
     default=f"{SCHEMA_VERSION}",
     help="Which schema to connect to",
 )
-arg_register_dataset.add_argument(
-    "--locale",
-    help="Location where dataset was produced",
-    type=str,
-    default="NERSC",
-)
-arg_register_dataset.add_argument(
-    "--owner", help="Owner of dataset. Defaults to $USER."
-)
-arg_register_dataset.add_argument(
-    "--owner-type", choices=["production", "group", "user"], default="user"
-)
 arg_register_dataset.add_argument(
     "--config_file", help="Location of data registry config file", type=str
 )
diff --git a/src/cli/register.py b/src/cli/register.py
index 98340e91..ca74870b 100644
--- a/src/cli/register.py
+++ b/src/cli/register.py
@@ -9,7 +9,7 @@ def register_dataset(args):
     Parameters
     ----------
     args : argparse object
-    
+
         args.config_file : str
             Path to data registry config file
         args.schema : str
@@ -38,11 +38,15 @@ def register_dataset(args):
         name=args.name,
         version_suffix=args.version_suffix,
         creation_date=args.creation_date,
+        access_API=args.access_API,
+        execution_id=args.execution_id,
+        is_overwritable=args.is_overwritable,
         description=args.description,
         old_location=args.old_location,
         copy=(not args.make_symlink),
         is_dummy=args.is_dummy,
         owner=args.owner,
+        owner_type=args.owner_type,
         execution_name=args.execution_name,
         execution_description=args.execution_description,
         execution_start=args.execution_start,
diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py
index 7411a69e..e0d51ee4 100644
--- a/src/dataregistry/registrar.py
+++ b/src/dataregistry/registrar.py
@@ -229,22 +229,19 @@ def register_execution(
         """
         Register a new execution in the DESC data registry.
 
+        Any args marked with '**' share their name with the associated column
+        in the registry schema. Descriptions of what these columns are can be
+        found in `schema.yaml` or the documentation.
+
         Parameters
         ----------
-        name : str
-            Typically pipeline name or program name
-        description : str, optional
-            Human readable description of execution
-        execution_start : datetime, optional
-            Date the execution started
-        locale : str, optional
-            Where was the execution performed?
-        configuration : str, optional
-            Path to text file used to configure the execution
-        input_datasets : list, optional
-            List of dataset ids that were the input to this execution
-        input_production_datasets : list, optional
-            List of production dataset ids that were the input to this execution
+        name** : str
+        description** : str, optional
+        execution_start** : datetime, optional
+        locale** : str, optional
+        configuration** : str, optional
+        input_datasets** : list, optional
+        input_production_datasets** : list, optional
         max_config_length : int, optional
             Maximum number of lines to read from a configuration file
 
@@ -323,39 +320,21 @@
         """
         Register a new dataset in the DESC data registry.
 
+        Any args marked with '**' share their name with the associated column
+        in the registry schema. Descriptions of what these columns are can be
+        found in `schema.yaml` or the documentation.
+
         Parameters
         ----------
-        relative_path : str
-            Destination for the dataset within the data registry. Path is
-            relative to ``<root_dir>/<owner_type>/<owner>``.
-        version : str
-            Semantic version string of the format MAJOR.MINOR.PATCH *or*
-            a special flag: "patch", "minor" or "major".
-
-            When a special flag is used it automatically bumps the relative
-            version for you (see examples for more details).
-        version_suffix : str, optional
-            Suffix string to place at the end of the version string. Cannot be
-            used for production datasets.
-        name : str, optional
-            Any convenient, evocative name for the human.
-
-            Note the combination of name, version and version_suffix must be
-            unique.
-        creation_date : datetime, optional
-            Manually set creation date of dataset
-        description : str, optional
-            Human-readable description of dataset
-        execution_id : int, optional
-            Used to associate dataset with a particular execution
-        access_API : str, optional
-            Hint as to how to read the data
-        access_API_configuration : str, optional
-            Path to configuration file for `access_API`
-        is_overwritable : bool, optional
-            True if dataset may be overwritten (defaults to False).
-
-            Note production datasets cannot be overwritten.
+        relative_path** : str
+        version** : str
+        version_suffix** : str, optional
+        name** : str, optional
+        creation_date** : datetime, optional
+        description** : str, optional
+        execution_id** : int, optional
+        access_API** : str, optional
+        is_overwritable** : bool, optional
         old_location : str, optional
             Absolute location of dataset to copy into the data registry.
 
@@ -370,23 +349,13 @@
             only)
         verbose : bool, optional
            Provide some additional output information
-        owner : str, optional
-            Owner of the dataset. If None, defaults to what was set in
-            Registrar __init__, if that is also None, defaults to $USER.
-        owner_type : str, optional
-            Owner type: "user", "group", or "production". If None, defaults to
-            what was set in Registrar __init__, if that is also None, defaults
-            to "user".
-        execution_name : str, optional
-            Typically pipeline name or program name
-        execution_description : str, optional
-            Human readable description of execution
-        execution_start : datetime, optional
-            Date the execution started
-        execution_locale : str, optional
-            Where was the execution performed?
-        execution_configuration : str, optional
-            Path to text file used to configure the execution
+        owner** : str, optional
+        owner_type** : str, optional
+        execution_name** : str, optional
+        execution_description** : str, optional
+        execution_start** : datetime, optional
+        execution_locale** : str, optional
+        execution_configuration** : str, optional
         input_datasets : list, optional
             List of dataset ids that were the input to this execution
         input_production_datasets : list, optional
@@ -506,10 +475,10 @@
         if version_suffix:
             values["version_suffix"] = version_suffix
         if creation_date:
-            values["dataset_creation_date"] = creation_date
+            values["creation_date"] = creation_date
         else:
             if ds_creation_date:
-                values["dataset_creation_date"] = ds_creation_date
+                values["creation_date"] = ds_creation_date
         if description:
             values["description"] = description
         if execution_id:
@@ -520,6 +489,9 @@
         values["access_API_configuration"] = _read_configuration_file(access_API_configuration, max_config_length)
         values["is_overwritable"] = is_overwritable
         values["is_overwritten"] = False
+        values["is_external_link"] = False
+        values["is_archived"] = False
+        values["is_valid"] = True
         values["register_date"] = datetime.now()
         values["owner_type"] = owner_type
         values["owner"] = owner
@@ -549,12 +521,14 @@ def register_dataset_alias(self, aliasname, dataset_id):
         """
         Register a new dataset alias in the DESC data registry.
 
+        Any args marked with '**' share their name with the associated column
+        in the registry schema. Descriptions of what these columns are can be
+        found in `schema.yaml` or the documentation.
+
         Parameters
         ----------
-        aliasname : str
-            Human readable alias for the dataset
-        dataset_id : int
-            Existing dataset ID to attach dataset alias to
+        aliasname** : str
+        dataset_id** : int
 
         Returns
         -------
diff --git a/src/dataregistry/schema/__init__.py b/src/dataregistry/schema/__init__.py
new file mode 100644
index 00000000..329fa0e8
--- /dev/null
+++ b/src/dataregistry/schema/__init__.py
@@ -0,0 +1 @@
+from .load_schema import load_schema
diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py
new file mode 100644
index 00000000..fbd0f238
--- /dev/null
+++ b/src/dataregistry/schema/load_schema.py
@@ -0,0 +1,40 @@
+import os
+import yaml
+
+
+def _populate_defaults(mydict):
+    """
+    Populate the default values for columns that haven't been specified in the
+    YAML file.
+
+    Parameters
+    ----------
+    mydict : dict
+    """
+
+    # Attributes we check for, and the default values used if missing
+    atts = {"nullable": True, "primary_key": False, "foreign_key": False,
+            "cli_optional": False, "cli_default": None, "choices": None}
+
+    # Loop over each table and column and ingest
+    for table in mydict.keys():
+        for column in mydict[table].keys():
+            for att in atts.keys():
+                if att not in mydict[table][column].keys():
+                    mydict[table][column][att] = atts[att]
+
+
+def load_schema():
+    """Load the schema layout from the YAML file"""
+
+    # Load
+    yaml_file_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "schema.yaml"
+    )
+    with open(yaml_file_path, "r") as file:
+        yaml_data = yaml.safe_load(file)
+
+    # Populate defaults
+    _populate_defaults(yaml_data)
+
+    return yaml_data
diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml
new file mode 100644
index 00000000..972fa4e5
--- /dev/null
+++ b/src/dataregistry/schema/schema.yaml
@@ -0,0 +1,282 @@
+---
+execution:
+  execution_id:
+    type: "Integer"
+    primary_key: True
+    description: "Unique identifier for execution"
+  description:
+    type: "String"
+    description: "Short description of execution"
+  register_date:
+    type: "DateTime"
+    description: "When was the execution registered in the database"
+    nullable: False
+  execution_start:
+    type: "DateTime"
+    description: "When was the execution performed at `locale`"
+  name:
+    type: "String"
+    description: "Identifies the code executed (e.g., could be pipeline name)"
+  locale:
+    type: "String"
+    description: "Site where the code was run (e.g., NERSC)"
+  configuration:
+    type: "String"
+    description: "Path to execution configuration file (txt, YAML, TOML, etc). Ingested as raw text"
+  creator_uid:
+    type: "StringShort"
+    description: "UID of person who registered the entry"
+    nullable: False
+
+provenance:
+  provenance_id:
+    type: "Integer"
+    primary_key: True
+    description: "Unique identifier for this provenance entry"
+  code_version_major:
+    type: "Integer"
+    description: "Major version of code when this schema was created"
+    nullable: False
+  code_version_minor:
+    type: "Integer"
+    description: "Minor version of code when this schema was created"
+    nullable: False
+  code_version_patch:
+    type: "Integer"
+    description: "Patch version of code when this schema was created"
+    nullable: False
+  code_version_suffix:
+    type: "String"
+    description: "Version suffix of code when this schema was created"
+  creator_uid:
+    type: "StringShort"
+    description: "UID of person who registered the entry"
+    nullable: False
+  db_version_major:
+    type: "Integer"
+    description: "Major version of schema"
+    nullable: False
+  db_version_minor:
+    type: "Integer"
+    description: "Minor version of schema"
+    nullable: False
+  db_version_patch:
+    type: "Integer"
+    description: "Patch version of schema"
+    nullable: False
+  git_hash:
+    type: "String"
+    description: "Git hash at time of schema creation"
+  repo_is_clean:
+    type: "Boolean"
+    description: "Was git repo clean at schema creation?"
+  update_method:
+    type: "String"
+    description: "What type of schema update does this entry relate to ('CREATE', 'MODIFY', 'MIGRATE')"
+    nullable: False
+  schema_enabled_date:
+    type: "DateTime"
+    description: "Date schema was created/updated"
+    nullable: False
+  comment:
+    type: "StringLong"
+    description: "Comment relating to new provenance entry"
+
+execution_alias:
+  execution_alias_id:
+    description: "Unique identifier for execution alias"
+    type: "Integer"
+    primary_key: True
+  supersede_date:
+    type: "DateTime"
+    description: "If a new entry has been added to the table with the same alias name (but a different `execution_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None"
+  creator_uid:
+    type: "StringShort"
+    description: "UID of person who registered the entry"
+    nullable: False
+  register_date:
+    type: "DateTime"
+    description: "Date the execution alias was registered"
+    nullable: False
+  alias:
+    type: "String"
+    description: "User given execution alias name"
+    nullable: False
+  execution_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "self"
+    foreign_key_table: "execution"
+    foreign_key_column: "execution_id"
+    description: "Execution this alias is linked to"
+
+dataset_alias:
+  dataset_alias_id:
+    description: "Unique identifier for dataset alias"
+    type: "Integer"
+    primary_key: True
+  supersede_date:
+    type: "DateTime"
+    description: "If a new entry has been added to the table with the same alias name (but a different `dataset_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None"
+  creator_uid:
+    type: "StringShort"
+    description: "UID of person who registered the entry"
+    nullable: False
+  register_date:
+    type: "DateTime"
+    description: "Date the dataset alias was registered"
+    nullable: False
+  alias:
+    type: "String"
+    description: "User given dataset alias name"
+    nullable: False
+  dataset_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "self"
+    foreign_key_table: "dataset"
+    foreign_key_column: "dataset_id"
+    description: "Dataset this alias is linked to"
+
+dependency:
+  dependency_id:
+    description: "Unique identifier for dependency"
+    type: "Integer"
+    primary_key: True
+  register_date:
+    type: "DateTime"
+    description: "Date the dependency was registered"
+    nullable: False
+  input_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "self"
+    foreign_key_table: "dataset"
+    foreign_key_column: "dataset_id"
+    description: "Dataset this dependency is linked to (for every dependency, this, or `input_production_id`, must be non-null)"
+  input_production_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "production"
+    foreign_key_table: "dataset"
+    foreign_key_column: "dataset_id"
+    description: "Production dataset this dependency is linked to (for every dependency, this, or `input_id`, must be non-null)"
+  execution_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "self"
+    foreign_key_table: "execution"
+    foreign_key_column: "execution_id"
+    description: "Execution this dependency is linked to"
+
+dataset:
+  dataset_id:
+    type: "Integer"
+    primary_key: True
+    description: "Unique identifier for this dataset"
+  name:
+    type: "String"
+    description: "Any convenient, evocative name for the human. Note the combination of name, version and version_suffix must be unique. If None, the name is generated from the relative path."
+    nullable: False
+    cli_optional: True
+  relative_path:
+    type: "String"
+    description: "Relative path storing the data, relative to `<root_dir>`"
+    nullable: False
+  version_major:
+    type: "Integer"
+    description: "Major version in semantic string (i.e., X.x.x)"
+    nullable: False
+  version_minor:
+    type: "Integer"
+    description: "Minor version in semantic string (i.e., x.X.x)"
+    nullable: False
+  version_patch:
+    type: "Integer"
+    description: "Patch version in semantic string (i.e., x.x.X)"
+    nullable: False
+  version_suffix:
+    type: "String"
+    description: "Optional version suffix to place at the end of the version string. Cannot be used for production datasets."
+    cli_optional: True
+  version_string:
+    type: "String"
+    description: "Version string"
+    nullable: False
+  creation_date:
+    type: "DateTime"
+    description: "Dataset creation date"
+    cli_optional: True
+  register_date:
+    type: "DateTime"
+    description: "Date the dataset was registered"
+    nullable: False
+  creator_uid:
+    type: "StringShort"
+    description: "UID of person who registered the entry"
+    nullable: False
+  access_API:
+    type: "StringShort"
+    description: "Describes the software that can read the dataset (e.g., 'gcr-catalogs', 'skyCatalogs')"
+    cli_optional: True
+  owner:
+    type: "String"
+    description: "Owner of the dataset (defaults to $USER)"
+    nullable: False
+    cli_optional: True
+  owner_type:
+    type: "String"
+    description: "Dataset owner type, can be 'user', 'group', 'project' or 'production'."
+    nullable: False
+    choices: ["user", "group", "project", "production"]
+    cli_default: "user"
+    cli_optional: True
+  data_org:
+    type: "String"
+    description: "Dataset organisation ('file' or 'directory')"
+    nullable: False
+  nfiles:
+    type: "Integer"
+    description: "How many files are in the dataset"
+    nullable: False
+  total_disk_space:
+    type: "Float"
+    description: "Total disk space used by the dataset"
+    nullable: False
+  register_root_dir:
+    type: "String"
+    description: "The `root_dir` the dataset was originally ingested into"
+    nullable: False
+  description:
+    type: "String"
+    description: "User provided human-readable description of the dataset"
+    cli_optional: True
+  is_valid:
+    type: "Boolean"
+    nullable: False
+    description: "False if, e.g., copy failed"
+  execution_id:
+    type: "Integer"
+    foreign_key: True
+    foreign_key_schema: "self"
+    foreign_key_table: "execution"
+    foreign_key_column: "execution_id"
+    description: "Execution this dataset is linked to"
+    cli_optional: True
+  is_overwritten:
+    type: "Boolean"
+    nullable: False
+    description: "True if the original data for this dataset has been overwritten at some point. This would have required that `is_overwritable` was set to true on the original dataset"
+  is_overwritable:
+    type: "Boolean"
+    nullable: False
+    description: "True means this dataset can be overwritten in the future"
+    cli_optional: True
+  is_external_link:
+    type: "Boolean"
+    nullable: False
+    description: "True if an external link"
+  is_archived:
+    type: "Boolean"
+    nullable: False
+    description: "True if dataset is archived"
diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh
index 9599b48b..b008e3ac 100644
--- a/tests/end_to_end_tests/create_test_entries_cli.sh
+++ b/tests/end_to_end_tests/create_test_entries_cli.sh
@@ -13,14 +13,20 @@ dregs register dataset my_cli_dataset2 "patch" \
 dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \
     --description "This is my dataset description" \
     --access_API "Awesome API" \
-    --locale "Secret location" \
     --owner "DESC" \
-    --owner-type "group" \
+    --owner_type "group" \
     --version_suffix "test" \
-    --root_dir "DataRegistry_data"
+    --root_dir "DataRegistry_data" \
+    --creation_date "2020-01-01" \
+    --input_datasets 1 2 \
+    --execution_name "I have given the execution a name" \
+    --is_overwritable
 
 # A production dataset
-dregs register dataset my_production_cli_dataset "0.1.2" \
-    --owner-type "production" \
-    --is_dummy \
-    --root_dir "DataRegistry_data"
+if [ "$DATAREG_BACKEND" = "postgres" ]; then
+    dregs register dataset my_production_cli_dataset "0.1.2" \
+        --owner_type "production" \
+        --is_dummy \
+        --root_dir "DataRegistry_data" \
+        --schema "production"
+fi
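
Note: as a quick illustration of the YAML-driven workflow this diff introduces, below is a minimal sketch of consuming the new `load_schema()` API the same way `create_registry_db.py` and the CLI now do. It assumes the `dataregistry` package is installed so that `schema/schema.yaml` ships with it; the printed layout is illustrative, not part of the patch.

    # Minimal sketch: inspect the loaded schema.
    # `load_schema()` returns a dict keyed by table name; `_populate_defaults`
    # has already filled in missing attributes ("nullable", "primary_key",
    # "foreign_key", "cli_optional", "cli_default", "choices").
    from dataregistry.schema import load_schema

    schema_data = load_schema()

    for table, columns in schema_data.items():
        print(f"{table}: {len(columns)} columns")
        for column, info in columns.items():
            print(f"  {column} ({info['type']}): {info['description']}")

    # The CLI builds its optional dataset flags from the same metadata, e.g.:
    assert schema_data["dataset"]["owner_type"]["choices"] == [
        "user", "group", "project", "production"
    ]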