From 9edbe087aed848e8e24470c7b0c8226fd627b24a Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:20:17 +0100 Subject: [PATCH 1/9] Add a central schema yaml file. This contains all the tables and row entries in a central location. Now the schema creation script, the CLI and the docs can use this, so we don't have to worry about missing somewhere when adding a new entry to a table. --- .github/workflows/documentation.yml | 2 +- docs/source/_static/css/custom.css | 3 + docs/source/conf.py | 9 +- docs/source/reference_schema.rst | 247 +-------------------- docs/source/templates/schema_table.tmpl | 22 ++ scripts/create_registry_db.py | 181 ++++++---------- src/dataregistry/schema/__init__.py | 1 + src/dataregistry/schema/load_schema.py | 35 +++ src/dataregistry/schema/schema.yaml | 271 ++++++++++++++++++++++++ 9 files changed, 410 insertions(+), 361 deletions(-) create mode 100644 docs/source/_static/css/custom.css create mode 100644 docs/source/templates/schema_table.tmpl create mode 100644 src/dataregistry/schema/__init__.py create mode 100644 src/dataregistry/schema/load_schema.py create mode 100644 src/dataregistry/schema/schema.yaml diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 2c7e4af8..ed2295b7 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -10,7 +10,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install . - pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram + pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram sphinxcontrib.datatemplates - name: Sphinx build run: | sphinx-build docs/source _build diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 00000000..7d2e89e6 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,3 @@ +.tight-table td { + white-space: normal !important; +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 8b2fa02e..6bbbb460 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,8 @@ "sphinx_rtd_theme", "sphinx.ext.autodoc", 'sphinx.ext.napoleon', - 'sphinxcontrib.autoprogram' + 'sphinxcontrib.autoprogram', + 'sphinxcontrib.datatemplates' ] project = 'DESC data management' @@ -36,3 +37,9 @@ html_logo = '_static/DREGS_logo_v2.png' autoclass_content = 'both' + +templates_path = ['templates'] + +html_css_files = [ + 'css/custom.css', +] diff --git a/docs/source/reference_schema.rst b/docs/source/reference_schema.rst index 9494ac52..66da4bc2 100644 --- a/docs/source/reference_schema.rst +++ b/docs/source/reference_schema.rst @@ -7,248 +7,5 @@ database (e.g., the default and production schemas) follows the same structure. .. image:: _static/schema_plot.png :alt: Image missing -The dataset table ------------------ - -.. 
list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dataset_id`` - - Unique identifier for dataset - - int - * - ``name`` - - User given name for dataset - - str - * - ``relative_path`` - - Relative path storing the data, relative to `` - - str - * - ``version_major`` - - Major version in semantic string (i.e., X.x.x) - - int - * - ``version_minor`` - - Minor version in semantic string (i.e., x.X.x) - - int - * - ``version_patch`` - - Patch version in semantic string (i.e., x.x.X) - - int - * - ``version_suffix`` - - Optional version suffix - - str - * - ``dataset_creation_date`` - - Dataset creation date - - datetime - * - ``is_archived`` - - True if the data is archived, i.e, the data is longer within `` - - bool - * - ``is_external_link`` - - ??? - - bool - * - ``is_overwritten`` - - True if the original data for this dataset has been overwritten at some point. This would have required that ``is_overwritable`` was set to ``true`` on the original dataset - - bool - * - ``is_valid`` - - ??? - - bool - * - ``register_date`` - - Date the dataset was registered - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - * - ``access_API`` - - Describes the software that can read the dataset (e.g., "gcr-catalogs", "skyCatalogs") - - str - * - ``execution_id`` - - ID of execution this dataset belongs to - - int - * - ``description`` - - User provided description of the dataset - - str - * - ``owner_type`` - - Datasets owner type, can be "user", "group", "project" or "production". - - str - * - ``owner`` - - Owner of the dataset - - str - * - ``data_org`` - - Dataset organisation ("file" or "directory") - - str - * - ``nfiles`` - - How many files are in the dataset - - int - * - ``total_disk_space`` - - Total disk spaced used by the dataset - - float - -The dataset_alias table ------------------------ - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dataset_alias_id`` - - Unique identifier for alias - - int - * - ``name`` - - User given alias name - - str - * - ``dataset_id`` - - ID of dataset this is an alias for - - int - * - ``supersede_date`` - - If a new entry has been added to the table with the same alias name (but - different dataset_id), the old entry will be superseded. ``supersede_date`` - in the old entry tracks when this happened. If the entry has not been - superseded, ``supersede_date`` will be None - - datetime - * - ``register_date`` - - Date the dataset was registered - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The dependency table --------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dependency_id`` - - Unique identifier for dependency - - int - * - ``execution_id`` - - Execution this dependency is linked to - - int - * - ``input_id`` - - Dataset ID of the dependent dataset - - int - * - ``register_date`` - - Date the dependency was registered - - datetime - -The execution table -------------------- - -.. 
list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``execution_id`` - - Unique identifier for execution - - int - * - ``description`` - - User given discription of execution - - str - * - ``name`` - - User given execution name - - str - * - ``register_date`` - - Date the execution was registered - - datetime - * - ``execution_start`` - - Date the execution started - - datetime - * - ``locale`` - - Locale of execution (e.g., NERSC) - - str - * - ``configuration`` - - Path to configuration file of execution - - str - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The execution_alias table -------------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``execution_alias_id`` - - Unique identifier for execution alias - - int - * - ``execution_id`` - - Execution this alias is linked to - - int - * - ``alias`` - - User given execution alias name - - str - * - ``register_date`` - - Date the execution was registered - - datetime - * - ``supersede_date`` - - If a new entry has been added to the table with the same alias name (but - different dataset_id), the old entry will be superseded. ``supersede_date`` - in the old entry tracks when this happened. If the entry has not been - superseded, ``supersede_date`` will be None - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The provenance table --------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``provenance_id`` - - Unique identifier for provenance - - int - * - ``code_version_major`` - - Major version of code when this schema was created - - int - * - ``code_version_minor`` - - Minor version of code when this schema was created - - int - * - ``code_version_patch`` - - Patch version of code when this schema was created - - int - * - ``code_version_suffix`` - - Version suffix of code when this schema was created - - str - * - ``db_version_major`` - - Major version of database - - int - * - ``db_version_minor`` - - Minor version of database - - int - * - ``db_version_patch`` - - Patch version of database - - int - * - ``git_hash`` - - Git commit hash when this schema was created - - str - * - ``repo_is_clean`` - - Was repository clean when this schema was created - - bool - * - ``update_method`` - - "CREATE", "MODIFY" or "MIGRATE" - - str - * - ``schema_enabled_date`` - - When was the schema enabled - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the schema - - str - * - ``comment`` - - Any comment - - str +.. datatemplate:yaml:: ../../src/dataregistry/schema/schema.yaml + :template: schema_table.tmpl diff --git a/docs/source/templates/schema_table.tmpl b/docs/source/templates/schema_table.tmpl new file mode 100644 index 00000000..7abbbce6 --- /dev/null +++ b/docs/source/templates/schema_table.tmpl @@ -0,0 +1,22 @@ +.. -*- mode: rst -*- + +{% for table in ['execution','provenance','execution_alias','dataset','dependency','dataset_alias'] %} + +The {{table}} table +---------------------------------------- + +.. 
list-table:: + :header-rows: 1 + :class: tight-table + + * - row + - description + - type + +{% for item in data[table] %} + * - {{item}} +{% for item2 in ['description', 'type'] %} + - {{data[table][item][item2]}} +{% endfor %} +{% endfor %} +{% endfor %} diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index add216a4..bda137de 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -7,6 +7,7 @@ from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION from dataregistry.db_basic import add_table_row, _insert_provenance +from dataregistry.schema import load_schema """ A script to create the default dataregistry schema and the production schema. @@ -20,6 +21,57 @@ - "provenance" : Contains information about the database/schema """ +# Conversion from string types in `schema.yaml` to SQLAlchemy +_TYPE_TRANSLATE = {"String": String, "Integer": Integer, "DateTime": DateTime, + "StringShort": String(20), "StringLong": String(250), "Boolean": Boolean, + "Float": Float} + +# Load the schema from the `schema.yaml` file +schema_yaml = load_schema() + +def _get_rows(schema, table): + """ + Build the SQLAlchemy `Column` list for this table from the information in + the `schema.yaml` file. + + Parameters + ---------- + schema : str + table : str + + Returns + ------- + return_dict : dict + SQLAlchemy Column entries for each table row + """ + + return_dict = {} + for column in schema_yaml[table].keys(): + + # Special case where row has a foreign key + if schema_yaml[table][column]["foreign_key"]: + if schema_yaml[table][column]["foreign_key_schema"] == "self": + schema_yaml[table][column]["foreign_key_schema"] = schema + + return_dict[column] = Column(column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + ForeignKey(_get_ForeignKey_str( + schema_yaml[table][column]["foreign_key_schema"], + schema_yaml[table][column]["foreign_key_table"], + schema_yaml[table][column]["foreign_key_row"])), + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], + ) + + # Normal case + else: + return_dict[column] = Column(column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"]) + + return return_dict + class Base(DeclarativeBase): pass @@ -35,25 +87,8 @@ def _Provenance(schema): class_name = f"{schema}_provenance" - # Rows - rows = { - "provenance_id": Column("provenance_id", Integer, primary_key=True), - "code_version_major": Column("code_version_major", Integer, nullable=False), - "code_version_minor": Column("code_version_minor", Integer, nullable=False), - "code_version_patch": Column("code_version_patch", Integer, nullable=False), - "code_version_suffix": Column("code_version_suffix", String), - "db_version_major": Column("db_version_major", Integer, nullable=False), - "db_version_minor": Column("db_version_minor", Integer, nullable=False), - "db_version_patch": Column("db_version_patch", Integer, nullable=False), - "git_hash": Column("git_hash", String, nullable=True), - "repo_is_clean": Column("repo_is_clean", Boolean, nullable=True), - # update method is always "CREATE" for this script. 
- # Alternative could be "MODIFY" or "MIGRATE" - "update_method": Column("update_method", String(10), nullable=False), - "schema_enabled_date": Column("schema_enabled_date", DateTime, nullable=False), - "creator_uid": Column("creator_uid", String(20), nullable=False), - "comment": Column("comment", String(250)), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "provenance") # Table metadata meta = {"__tablename__": "provenance", "__table_args__": {"schema": schema}} @@ -67,19 +102,8 @@ def _Execution(schema): class_name = f"{schema}_execution" - # Rows - rows = { - "execution_id": Column("execution_id", Integer, primary_key=True), - "description": Column("description", String), - "register_date": Column("register_date", DateTime, nullable=False), - "execution_start": Column("execution_start", DateTime), - # name is meant to identify the code executed. E.g., could be pipeline name - "name": Column("name", String), - # locale is, e.g. site where code was run - "locale": Column("locale", String), - "configuration": Column("configuration", String), - "creator_uid": Column("creator_uid", String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "execution") # Table metadata meta = {"__tablename__": "execution", "__table_args__": {"schema": schema}} @@ -93,18 +117,8 @@ def _ExecutionAlias(schema): class_name = f"{schema}_execution_alias" - # Rows - rows = { - "execution_alias_id": Column("execution_alias_id", Integer, primary_key=True), - "alias": Column(String, nullable=False), - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "supersede_date": Column(DateTime, default=None), - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "execution_alias") # Table metadata meta = { @@ -124,17 +138,8 @@ def _DatasetAlias(schema): class_name = f"{schema}_dataset_alias" - # Rows - rows = { - "dataset_alias_id": Column(Integer, primary_key=True), - "alias": Column(String, nullable=False), - "dataset_id": Column( - Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id")) - ), - "supersede_date": Column(DateTime, default=None), - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dataset_alias") # Table metadata meta = { @@ -154,47 +159,8 @@ def _Dataset(schema): class_name = f"{schema}_dataset" - # Rows - rows = { - "dataset_id": Column(Integer, primary_key=True), - "name": Column(String, nullable=False), - "relative_path": Column(String, nullable=False), - "version_major": Column(Integer, nullable=False), - "version_minor": Column(Integer, nullable=False), - "version_patch": Column(Integer, nullable=False), - "version_string": Column(String, nullable=False), - "version_suffix": Column(String), - "dataset_creation_date": Column(DateTime), - "is_archived": Column(Boolean, default=False), - "is_external_link": Column(Boolean, default=False), - "is_overwritable": Column(Boolean, default=False), - "is_overwritten": Column(Boolean, default=False), - "is_valid": Column(Boolean, default=True), # False if, e.g., copy failed - # The following are boilerplate, included in all or most tables - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - # Make access_API a string for now, but it 
could be an enumeration or - # a foreign key into another table. Possible values for the column - # might include "gcr-catalogs", "skyCatalogs" - "access_API": Column("access_API", String(20)), - # A way to associate a dataset with a program execution or "run" - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "description": Column(String), - "owner_type": Column(String, nullable=False), - # If ownership_type is 'production', then owner is always 'production' - # If ownership_type is 'group', owner will be a group name - # If ownership_type is 'user', owner will be a user name - "owner": Column(String, nullable=False), - # To store metadata about the dataset. - "data_org": Column("data_org", String, nullable=False), - "nfiles": Column("nfiles", Integer, nullable=False), - "total_disk_space": Column("total_disk_space", Float, nullable=False), - # What `root_dir` was the data originially ingested into - "register_root_dir": Column(String, nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dataset") # Table metadata meta = { @@ -217,25 +183,12 @@ def _Dependency(schema, has_production): class_name = f"{schema}_dependency" - # Rows - rows = { - "dependency_id": Column(Integer, primary_key=True), - "register_date": Column(DateTime, nullable=False), - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "input_id": Column( - Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id")) - ), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dependency") - # Add link to production schema. - if has_production: - rows["input_production_id"] = Column( - Integer, - ForeignKey(_get_ForeignKey_str("production", "dataset", "dataset_id")), - ) + # Remove link to production schema. + if not has_production: + del rows["input_production_id"] # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} diff --git a/src/dataregistry/schema/__init__.py b/src/dataregistry/schema/__init__.py new file mode 100644 index 00000000..329fa0e8 --- /dev/null +++ b/src/dataregistry/schema/__init__.py @@ -0,0 +1 @@ +from .load_schema import load_schema diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py new file mode 100644 index 00000000..e1f2b19f --- /dev/null +++ b/src/dataregistry/schema/load_schema.py @@ -0,0 +1,35 @@ +import os +import yaml + +def _populate_defaults(mydict): + """ + Populate the default values for rows that haven't been specified in the + YAML file. 
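+    Any attribute not listed for a row in the YAML falls back to a default
+    here, i.e., the row is assumed to be nullable, not a primary key and not
+    a foreign key.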
+ + Parameters + ---------- + mydict : dict + """ + + # Attributes we check for and populate with these default value if missing + atts = {"nullable": True, "primary_key": False, "foreign_key": False} + + # Loop over eah row and ingest + for table in mydict.keys(): + for row in mydict[table].keys(): + for att in atts.keys(): + if att not in mydict[table][row].keys(): + mydict[table][row][att] = atts[att] + +def load_schema(): + """ Load the schema layout from the YAML file """ + + # Load + yaml_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") + with open(yaml_file_path, 'r') as file: + yaml_data = yaml.safe_load(file) + + # Populate defaults + _populate_defaults(yaml_data) + + return yaml_data diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml new file mode 100644 index 00000000..0c6dadbd --- /dev/null +++ b/src/dataregistry/schema/schema.yaml @@ -0,0 +1,271 @@ +--- +execution: + execution_id: + type: "Integer" + primary_key: True + description: "Unique identifier for execution" + description: + type: "String" + description: "Short description of execution" + register_date: + type: "DateTime" + description: "When was the execution registered in the database" + nullable: False + execution_start: + type: "DateTime" + description: "When was the execution performed at `locale`" + name: + type: "String" + description: "Identifies the code executed (e.g., could be pipeline name)" + locale: + type: "String" + description: "Site where the code was run (e.g., NERSC)" + configuration: + type: "String" + description: "Path to execution configuration file (txt, YAML, TOML, etc). Ingested as raw text" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + +provenance: + provenance_id: + type: "Integer" + primary_key: True + description: "Unique identifier for this provenance entry" + code_version_major: + type: "Integer" + description: "Major version of code when this schema was created" + nullable: False + code_version_minor: + type: "Integer" + description: "Minor version of code when this schema was created" + nullable: False + code_version_patch: + type: "Integer" + description: "Patch version of code when this schema was created" + nullable: False + code_version_suffix: + type: "String" + description: "Version suffix of code when this schema was created" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + db_version_major: + type: "Integer" + description: "Major version of schema" + nullable: False + db_version_minor: + type: "Integer" + description: "Minor version of schema" + nullable: False + db_version_patch: + type: "Integer" + description: "Patch version of schema" + nullable: False + git_hash: + type: "String" + description: "Git hash at time of schema creation" + repo_is_clean: + type: "Boolean" + description: "Was git repo clean at schema creation?" 
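+  # The schema creation script always writes "CREATE" for `update_method`;
+  # "MODIFY" and "MIGRATE" are the alternative values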
+ update_method: + type: "String" + description: "What type of schema update does this entry relate to ('CREATE','MODIFY','MIGRATE')" + nullable: False + schema_enabled_date: + type: "DateTime" + description: "Date schema was created/updated" + nullable: False + comment: + type: "StringLong" + description: "Comment relating to new provenance entry" + +execution_alias: + execution_alias_id: + description: "Unique identifier for execution alias" + type: "Integer" + primary_key: True + supersede_date: + type: "DateTime" + description: "If a new entry has been added to the table with the same alias name (but different `dataset_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + register_date: + type: "DateTime" + description: "Date the execution alias was registered" + nullable: False + alias: + type: "String" + description: "User given execution alias name" + nullable: False + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this alias is linked to" + +dataset_alias: + dataset_alias_id: + description: "Unique identifier for dataset alias" + type: "Integer" + primary_key: True + supersede_date: + type: "DateTime" + description: "If a new entry has been added to the table with the same alias name (but different `dataset_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + register_date: + type: "DateTime" + description: "Date the dataset alias was registered" + nullable: False + alias: + type: "String" + description: "User given dataset alias name" + nullable: False + dataset_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Dataset this alias is linked to" + +dependency: + dependency_id: + description: "Unique identifier for dependency" + type: "Integer" + primary_key: True + register_date: + type: "DateTime" + description: "Date the dependency was registered" + nullable: False + input_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Dataset this dependency is linked to" + input_production_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "production" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Production dataset this dependency is linked to" + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this dependency is linked to" + +dataset: + dataset_id: + type: "Integer" + primary_key: True + description: "Unique identifier for this dataset" + name: + type: "String" + description: "User given name for this dataset" + nullable: False + relative_path: + type: "String" + description: "Relative path storing the data, relative to ``" + nullable: False + version_major: + type: "Integer" + description: "Major version in semantic string (i.e., X.x.x)" + nullable: 
False + version_minor: + type: "Integer" + description: "Minor version in semantic string (i.e., x.X.x)" + nullable: False + version_patch: + type: "Integer" + description: "Patch version in semantic string (i.e., x.x.X)" + nullable: False + version_suffix: + type: "String" + description: "Optional version suffix" + version_string: + type: "String" + description: "Version string" + nullable: False + dataset_creation_date: + type: "DateTime" + description: "Dataset creation date" + register_date: + type: "DateTime" + description: "Date the dataset was registered" + nullable: False + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + access_API: + type: "StringShort" + description: "Describes the software that can read the dataset (e.g., 'gcr-catalogs', 'skyCatalogs')" + owner: + type: "String" + description: "Owner of the dataset" + nullable: False + owner_type: + type: "String" + description: "Datasets owner type, can be 'user', 'group', 'project' or 'production'." + nullable: False + data_org: + type: "String" + description: "Dataset organisation ('file' or 'directory')" + nullable: False + nfiles: + type: "Integer" + description: "How many files are in the dataset" + nullable: False + total_disk_space: + type: "Float" + description: "Total disk spaced used by the dataset" + nullable: False + register_root_dir: + type: "String" + description: "The `root_dir` the dataset was originally ingested into" + nullable: False + description: + type: "String" + description: "User provided description of the dataset" + is_valid: + type: "Boolean" + default: True + description: "False if, e.g., copy failed" + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this dataset is linked to" + is_overwritten: + type: "Boolean" + default: False + description: "True if the original data for this dataset has been overwritten at some point. 
This would have required that `is_overwritable` was set to true on the original dataset" + is_overwritable: + type: "Boolean" + default: False + description: "True means this dataset can be overwritten in the future" + is_external_link: + type: "Boolean" + default: False + description: "True if an external link" + is_archived: + type: "Boolean" + default: False + description: "True if dataset is archived" From 0b360da82fbf154198f880e84a74b085c80de265 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:25:02 +0100 Subject: [PATCH 2/9] Add schema.yaml to pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c15b3459..8cc09883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,4 +38,4 @@ where = ["src"] dregs = "cli.cli:main" [tool.setuptools.package-data] -"dataregistry" = ["site_config/site_rootdir.yaml"] +"dataregistry" = ["site_config/site_rootdir.yaml", "schema/schema.yaml"] From 375f1eb629df1c0c8df8520ddcfe15e443c84170 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:30:00 +0100 Subject: [PATCH 3/9] Format --- scripts/create_registry_db.py | 67 +++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index bda137de..0e2d5aff 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -22,13 +22,20 @@ """ # Conversion from string types in `schema.yaml` to SQLAlchemy -_TYPE_TRANSLATE = {"String": String, "Integer": Integer, "DateTime": DateTime, - "StringShort": String(20), "StringLong": String(250), "Boolean": Boolean, - "Float": Float} +_TYPE_TRANSLATE = { + "String": String, + "Integer": Integer, + "DateTime": DateTime, + "StringShort": String(20), + "StringLong": String(250), + "Boolean": Boolean, + "Float": Float, +} # Load the schema from the `schema.yaml` file schema_yaml = load_schema() + def _get_rows(schema, table): """ Build the SQLAlchemy `Column` list for this table from the information in @@ -38,7 +45,7 @@ def _get_rows(schema, table): ---------- schema : str table : str - + Returns ------- return_dict : dict @@ -47,35 +54,59 @@ def _get_rows(schema, table): return_dict = {} for column in schema_yaml[table].keys(): - # Special case where row has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": schema_yaml[table][column]["foreign_key_schema"] = schema - return_dict[column] = Column(column, - _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], - ForeignKey(_get_ForeignKey_str( + return_dict[column] = Column( + column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + ForeignKey( + _get_ForeignKey_str( schema_yaml[table][column]["foreign_key_schema"], - schema_yaml[table][column]["foreign_key_table"], - schema_yaml[table][column]["foreign_key_row"])), - primary_key=schema_yaml[table][column]["primary_key"], - nullable=schema_yaml[table][column]["nullable"], + schema_yaml[table][column]["foreign_key_table"], + schema_yaml[table][column]["foreign_key_row"], + ) + ), + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], ) # Normal case else: - return_dict[column] = Column(column, - _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], - primary_key=schema_yaml[table][column]["primary_key"], - nullable=schema_yaml[table][column]["nullable"]) + return_dict[column] = Column( + column, + 
_TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], + ) return return_dict + class Base(DeclarativeBase): pass + def _get_ForeignKey_str(schema, table, row): + """ + Get the string reference to the ".." a foreign key will + point to. + + The schema address will only be included for postgres backends. + + Parameters + --------- + schema : str + table : str + row : str + + Returns + ------- + - : str + """ + if schema is None: return f"{table}.{row}" else: @@ -189,7 +220,7 @@ def _Dependency(schema, has_production): # Remove link to production schema. if not has_production: del rows["input_production_id"] - + # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} @@ -241,7 +272,7 @@ def _Dependency(schema, has_production): stmt = f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}" conn.execute(text(stmt)) conn.commit() - + # Grant reg_reader access acct = "reg_reader" for SCHEMA in SCHEMA_LIST: From 0b499b80bca2e1ba8ee3e8a3734b13ca68ed1aa2 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:48:49 +0100 Subject: [PATCH 4/9] Remove defaults from table creation, move to registrar --- scripts/create_registry_db.py | 3 ++- src/cli/cli.py | 1 + src/dataregistry/registrar.py | 3 +++ src/dataregistry/schema/load_schema.py | 20 ++++++++++++-------- src/dataregistry/schema/schema.yaml | 10 +++++----- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index 0e2d5aff..8506ba7f 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -2,7 +2,7 @@ import sys import argparse from datetime import datetime -from sqlalchemy import Column, Integer, String, DateTime, Boolean, Index, Float +from sqlalchemy import Column, ColumnDefault, Integer, String, DateTime, Boolean, Index, Float from sqlalchemy import ForeignKey, UniqueConstraint, text from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION @@ -54,6 +54,7 @@ def _get_rows(schema, table): return_dict = {} for column in schema_yaml[table].keys(): + # Special case where row has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": diff --git a/src/cli/cli.py b/src/cli/cli.py index ed07b67d..1786f825 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -4,6 +4,7 @@ from dataregistry.db_basic import SCHEMA_VERSION from .register import register_dataset from .query import dregs_ls +from dataregistry.schema import load_schema # --------------------- # The data registry CLI diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 309d4d41..7d13725a 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -517,6 +517,9 @@ def register_dataset( values["access_API"] = access_API values["is_overwritable"] = is_overwritable values["is_overwritten"] = False + values["is_external_link"] = False + values["is_archived"] = False + values["is_valid"] = True values["register_date"] = datetime.now() values["owner_type"] = owner_type values["owner"] = owner diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py index e1f2b19f..d9a1fac8 100644 --- a/src/dataregistry/schema/load_schema.py +++ b/src/dataregistry/schema/load_schema.py @@ -1,6 +1,7 @@ import os import yaml + def _populate_defaults(mydict): """ 
Populate the default values for rows that haven't been specified in the @@ -10,23 +11,26 @@ def _populate_defaults(mydict): ---------- mydict : dict """ - + # Attributes we check for and populate with these default value if missing atts = {"nullable": True, "primary_key": False, "foreign_key": False} # Loop over eah row and ingest for table in mydict.keys(): - for row in mydict[table].keys(): - for att in atts.keys(): - if att not in mydict[table][row].keys(): - mydict[table][row][att] = atts[att] + for row in mydict[table].keys(): + for att in atts.keys(): + if att not in mydict[table][row].keys(): + mydict[table][row][att] = atts[att] + def load_schema(): - """ Load the schema layout from the YAML file """ + """Load the schema layout from the YAML file""" # Load - yaml_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") - with open(yaml_file_path, 'r') as file: + yaml_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "schema.yaml" + ) + with open(yaml_file_path, "r") as file: yaml_data = yaml.safe_load(file) # Populate defaults diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index 0c6dadbd..3449de97 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -244,7 +244,7 @@ dataset: description: "User provided description of the dataset" is_valid: type: "Boolean" - default: True + nullable: False description: "False if, e.g., copy failed" execution_id: type: "Integer" @@ -255,17 +255,17 @@ dataset: description: "Execution this dataset is linked to" is_overwritten: type: "Boolean" - default: False + nullable: False description: "True if the original data for this dataset has been overwritten at some point. This would have required that `is_overwritable` was set to true on the original dataset" is_overwritable: type: "Boolean" - default: False + nullable: False description: "True means this dataset can be overwritten in the future" is_external_link: type: "Boolean" - default: False + nullable: False description: "True if an external link" is_archived: type: "Boolean" - default: False + nullablee: False description: "True if dataset is archived" From 65f4c6eee37b188c3ff33d084028306e9fc470a2 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 17:33:55 +0100 Subject: [PATCH 5/9] Modify CLI to use schema.yaml --- src/cli/cli.py | 92 +++++++++---------- src/cli/register.py | 3 +- src/dataregistry/schema/load_schema.py | 3 +- src/dataregistry/schema/schema.yaml | 21 ++++- .../create_test_entries_cli.sh | 8 +- 5 files changed, 65 insertions(+), 62 deletions(-) diff --git a/src/cli/cli.py b/src/cli/cli.py index 1786f825..86bcc62d 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -46,6 +46,20 @@ # Register a dataset # ------------------ +# Load the schema information +schema_data = load_schema() + +# Conversion from string types in `schema.yaml` to SQLAlchemy +_TYPE_TRANSLATE = { + "String": str, + "Integer": int, + "DateTime": str, + "StringShort": str, + "StringLong": str, + "Boolean": bool, + "Float": float, +} + # Register a new database entry. arg_register = subparsers.add_parser( "register", help="Register a new entry to the database" @@ -58,6 +72,33 @@ # Register a new dataset. arg_register_dataset = arg_register_sub.add_parser("dataset", help="Register a dataset") +# Get some information from the `schema.yaml` file +for row in schema_data["dataset"]: + # Any default? 
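+    # (e.g. `owner_type` declares `cli_default: "user"` in `schema.yaml`; for
+    # that row the loop ends up calling roughly
+    #   add_argument("--owner_type", default="user", choices=[...], type=str),
+    # so the flag falls back to "user" when omitted on the command line)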
+ if schema_data["dataset"][row]["cli_default"] is not None: + default = schema_data["dataset"][row]["cli_default"] + default_str = f" (default={default})" + else: + default = None + default_str = "" + + # Restricted to choices? + if schema_data["dataset"][row]["choices"] is not None: + choices = schema_data["dataset"][row]["choices"] + else: + choices = None + + # Add flag + if schema_data["dataset"][row]["cli_optional"]: + arg_register_dataset.add_argument( + "--" + row, + help=schema_data["dataset"][row]["description"] + default_str, + default=default, + choices=choices, + type=_TYPE_TRANSLATE[schema_data["dataset"][row]["type"]], + ) + +# Entries unique to registering the dataset using the CLI arg_register_dataset.add_argument( "relative_path", help=( @@ -76,45 +117,6 @@ ), type=str, ) -arg_register_dataset.add_argument( - "--version_suffix", - help=( - "Optional suffix string to place at the end of the version string." - "Cannot be used for production datasets." - ), - type=str, -) -arg_register_dataset.add_argument( - "--name", - help=( - "Any convenient, evocative name for the human. Note the combination of" - "name, version and version_suffix must be unique. If None name is generated" - "from the relative path." - ), - type=str, -) -arg_register_dataset.add_argument( - "--creation_date", help="Manually set creation date of dataset" -) -arg_register_dataset.add_argument( - "--description", help="Human-readable description of dataset", type=str -) -arg_register_dataset.add_argument( - "--execution_id", - help="Used to associate dataset with a particular execution", - type=int, -) -arg_register_dataset.add_argument( - "--access_API", help="Hint as to how to read the data", type=str -) -arg_register_dataset.add_argument( - "--is_overwritable", - help=( - "True if dataset may be overwritten (defaults to False). Production" - "datasets cannot be overwritten." - ), - action="store_true", -) arg_register_dataset.add_argument( "--old_location", help=( @@ -138,18 +140,6 @@ default=f"{SCHEMA_VERSION}", help="Which schema to connect to", ) -arg_register_dataset.add_argument( - "--locale", - help="Location where dataset was produced", - type=str, - default="NERSC", -) -arg_register_dataset.add_argument( - "--owner", help="Owner of dataset. Defaults to $USER." 
-) -arg_register_dataset.add_argument( - "--owner-type", choices=["production", "group", "user"], default="user" -) arg_register_dataset.add_argument( "--config_file", help="Location of data registry config file", type=str ) diff --git a/src/cli/register.py b/src/cli/register.py index 98340e91..5518c3e0 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -9,7 +9,7 @@ def register_dataset(args): Parameters ---------- args : argparse object - + args.config_file : str Path to data registry config file args.schema : str @@ -43,6 +43,7 @@ def register_dataset(args): copy=(not args.make_symlink), is_dummy=args.is_dummy, owner=args.owner, + owner_type=args.owner_type, execution_name=args.execution_name, execution_description=args.execution_description, execution_start=args.execution_start, diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py index d9a1fac8..fbd0f238 100644 --- a/src/dataregistry/schema/load_schema.py +++ b/src/dataregistry/schema/load_schema.py @@ -13,7 +13,8 @@ def _populate_defaults(mydict): """ # Attributes we check for and populate with these default value if missing - atts = {"nullable": True, "primary_key": False, "foreign_key": False} + atts = {"nullable": True, "primary_key": False, "foreign_key": False, "cli_optional": False, + "cli_default": None, "choices": None} # Loop over eah row and ingest for table in mydict.keys(): diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index 3449de97..e9e1c214 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -176,8 +176,9 @@ dataset: description: "Unique identifier for this dataset" name: type: "String" - description: "User given name for this dataset" + description: "Any convenient, evocative name for the human. Note the combination of name, version and version_suffix must be unique. If None name is generated from the relative path." nullable: False + cli_optional: True relative_path: type: "String" description: "Relative path storing the data, relative to ``" @@ -196,14 +197,16 @@ dataset: nullable: False version_suffix: type: "String" - description: "Optional version suffix" + description: "Optional version suffix to place at the end of the version string. Cannot be used for production datasets." + cli_optional: True version_string: type: "String" description: "Version string" nullable: False - dataset_creation_date: + creation_date: type: "DateTime" description: "Dataset creation date" + cli_optional: True register_date: type: "DateTime" description: "Date the dataset was registered" @@ -215,14 +218,19 @@ dataset: access_API: type: "StringShort" description: "Describes the software that can read the dataset (e.g., 'gcr-catalogs', 'skyCatalogs')" + cli_optional: True owner: type: "String" - description: "Owner of the dataset" + description: "Owner of the dataset (defaults to $USER)" nullable: False + cli_optional: True owner_type: type: "String" description: "Datasets owner type, can be 'user', 'group', 'project' or 'production'." 
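+    # (when `owner_type` is 'production' the owner is always 'production';
+    # for 'group'/'user' it is a group or user name)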
nullable: False + choices: ["user", "group", "project", "production"] + cli_default: "user" + cli_optional: True data_org: type: "String" description: "Dataset organisation ('file' or 'directory')" @@ -241,7 +249,8 @@ dataset: nullable: False description: type: "String" - description: "User provided description of the dataset" + description: "User provided human-readable description of the dataset" + cli_optional: True is_valid: type: "Boolean" nullable: False @@ -253,6 +262,7 @@ dataset: foreign_key_table: "execution" foreign_key_row: "execution_id" description: "Execution this dataset is linked to" + cli_optional: True is_overwritten: type: "Boolean" nullable: False @@ -261,6 +271,7 @@ dataset: type: "Boolean" nullable: False description: "True means this dataset can be overwritten in the future" + cli_optional: True is_external_link: type: "Boolean" nullable: False diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index 9599b48b..db37883c 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -13,14 +13,14 @@ dregs register dataset my_cli_dataset2 "patch" \ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --description "This is my dataset description" \ --access_API "Awesome API" \ - --locale "Secret location" \ --owner "DESC" \ - --owner-type "group" \ + --owner_type "group" \ --version_suffix "test" \ --root_dir "DataRegistry_data" # A production dataset dregs register dataset my_production_cli_dataset "0.1.2" \ - --owner-type "production" \ + --owner_type "production" \ --is_dummy \ - --root_dir "DataRegistry_data" + --root_dir "DataRegistry_data" \ + --schema "production" From 5d6b0ce9a92103073d1e7689adbda725c85c83ec Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 17:41:09 +0100 Subject: [PATCH 6/9] Fix for sqlite --- .github/workflows/ci.yml | 2 ++ tests/end_to_end_tests/create_test_entries_cli.sh | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 88a0f860..c9e893e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,7 @@ jobs: env: DATAREG_CONFIG: "${{ github.workspace }}/config.txt" + DATAREG_BACKEND: "postgres" # Service containers to run with `runner-job` services: @@ -150,6 +151,7 @@ jobs: env: DATAREG_CONFIG: "${{ github.workspace }}/config.txt" + DATAREG_BACKEND: "sqlite" # Our strategy lists the OS and Python versions we want to test on. strategy: diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index db37883c..b439b4aa 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -19,8 +19,10 @@ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --root_dir "DataRegistry_data" # A production dataset -dregs register dataset my_production_cli_dataset "0.1.2" \ - --owner_type "production" \ - --is_dummy \ - --root_dir "DataRegistry_data" \ - --schema "production" +if [ "$DATAREG_BACKEND" = "postgres" ]; then + dregs register dataset my_production_cli_dataset "0.1.2" \ + --owner_type "production" \ + --is_dummy \ + --root_dir "DataRegistry_data" \ + --schema "production" +fi From abb8492339fa8cfcc205636e954b837add0f9a5e Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 09:30:57 +0100 Subject: [PATCH 7/9] Address reviewer comments. 
Mainly chaging references of 'row' to 'column'. --- scripts/create_registry_db.py | 70 ++++++++++++++++------------- src/cli/cli.py | 18 ++++---- src/dataregistry/schema/schema.yaml | 18 ++++---- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index 8506ba7f..4c21d302 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -2,11 +2,20 @@ import sys import argparse from datetime import datetime -from sqlalchemy import Column, ColumnDefault, Integer, String, DateTime, Boolean, Index, Float +from sqlalchemy import ( + Column, + ColumnDefault, + Integer, + String, + DateTime, + Boolean, + Index, + Float, +) from sqlalchemy import ForeignKey, UniqueConstraint, text from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION -from dataregistry.db_basic import add_table_row, _insert_provenance +from dataregistry.db_basic import _insert_provenance from dataregistry.schema import load_schema """ @@ -36,7 +45,7 @@ schema_yaml = load_schema() -def _get_rows(schema, table): +def _get_column_definitions(schema, table): """ Build the SQLAlchemy `Column` list for this table from the information in the `schema.yaml` file. @@ -49,13 +58,12 @@ def _get_rows(schema, table): Returns ------- return_dict : dict - SQLAlchemy Column entries for each table row + SQLAlchemy Column entries for each table """ return_dict = {} for column in schema_yaml[table].keys(): - - # Special case where row has a foreign key + # Special case where column has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": schema_yaml[table][column]["foreign_key_schema"] = schema @@ -67,7 +75,7 @@ def _get_rows(schema, table): _get_ForeignKey_str( schema_yaml[table][column]["foreign_key_schema"], schema_yaml[table][column]["foreign_key_table"], - schema_yaml[table][column]["foreign_key_row"], + schema_yaml[table][column]["foreign_key_column"], ) ), primary_key=schema_yaml[table][column]["primary_key"], @@ -90,9 +98,9 @@ class Base(DeclarativeBase): pass -def _get_ForeignKey_str(schema, table, row): +def _get_ForeignKey_str(schema, table, column): """ - Get the string reference to the ".
." a foreign key will + Get the string reference to the ".
." a foreign key will point to. The schema address will only be included for postgres backends. @@ -101,7 +109,7 @@ def _get_ForeignKey_str(schema, table, row): --------- schema : str table : str - row : str + column : str Returns ------- @@ -109,9 +117,9 @@ def _get_ForeignKey_str(schema, table, row): """ if schema is None: - return f"{table}.{row}" + return f"{table}.{column}" else: - return f"{schema}.{table}.{row}" + return f"{schema}.{table}.{column}" def _Provenance(schema): @@ -119,13 +127,13 @@ def _Provenance(schema): class_name = f"{schema}_provenance" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "provenance") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "provenance") # Table metadata meta = {"__tablename__": "provenance", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -134,13 +142,13 @@ def _Execution(schema): class_name = f"{schema}_execution" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "execution") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "execution") # Table metadata meta = {"__tablename__": "execution", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -149,8 +157,8 @@ def _ExecutionAlias(schema): class_name = f"{schema}_execution_alias" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "execution_alias") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "execution_alias") # Table metadata meta = { @@ -161,7 +169,7 @@ def _ExecutionAlias(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -170,8 +178,8 @@ def _DatasetAlias(schema): class_name = f"{schema}_dataset_alias" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dataset_alias") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dataset_alias") # Table metadata meta = { @@ -182,7 +190,7 @@ def _DatasetAlias(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -191,8 +199,8 @@ def _Dataset(schema): class_name = f"{schema}_dataset" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dataset") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dataset") # Table metadata meta = { @@ -206,7 +214,7 @@ def _Dataset(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -215,17 +223,17 @@ def _Dependency(schema, has_production): class_name = f"{schema}_dependency" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dependency") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dependency") # Remove link to production schema. 
if not has_production: - del rows["input_production_id"] + del columns["input_production_id"] # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model diff --git a/src/cli/cli.py b/src/cli/cli.py index 86bcc62d..b6a0f9de 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -73,29 +73,29 @@ arg_register_dataset = arg_register_sub.add_parser("dataset", help="Register a dataset") # Get some information from the `schema.yaml` file -for row in schema_data["dataset"]: +for column in schema_data["dataset"]: # Any default? - if schema_data["dataset"][row]["cli_default"] is not None: - default = schema_data["dataset"][row]["cli_default"] + if schema_data["dataset"][column]["cli_default"] is not None: + default = schema_data["dataset"][column]["cli_default"] default_str = f" (default={default})" else: default = None default_str = "" # Restricted to choices? - if schema_data["dataset"][row]["choices"] is not None: - choices = schema_data["dataset"][row]["choices"] + if schema_data["dataset"][column]["choices"] is not None: + choices = schema_data["dataset"][column]["choices"] else: choices = None # Add flag - if schema_data["dataset"][row]["cli_optional"]: + if schema_data["dataset"][column]["cli_optional"]: arg_register_dataset.add_argument( - "--" + row, - help=schema_data["dataset"][row]["description"] + default_str, + "--" + column, + help=schema_data["dataset"][column]["description"] + default_str, default=default, choices=choices, - type=_TYPE_TRANSLATE[schema_data["dataset"][row]["type"]], + type=_TYPE_TRANSLATE[schema_data["dataset"][column]["type"]], ) # Entries unique to registering the dataset using the CLI diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index e9e1c214..972fa4e5 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -107,7 +107,7 @@ execution_alias: foreign_key: True foreign_key_schema: "self" foreign_key_table: "execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this alias is linked to" dataset_alias: @@ -135,7 +135,7 @@ dataset_alias: foreign_key: True foreign_key_schema: "self" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" + foreign_key_column: "dataset_id" description: "Dataset this alias is linked to" dependency: @@ -152,21 +152,21 @@ dependency: foreign_key: True foreign_key_schema: "self" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" - description: "Dataset this dependency is linked to" + foreign_key_column: "dataset_id" + description: "Dataset this dependency is linked to (for every dependency, this, or `input_production_id`, must be non-null)" input_production_id: type: "Integer" foreign_key: True foreign_key_schema: "production" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" - description: "Production dataset this dependency is linked to" + foreign_key_column: "dataset_id" + description: "Production dataset this dependency is linked to (for every dependency, this, or `input_id`, must be non-null)" execution_id: type: "Integer" foreign_key: True foreign_key_schema: "self" foreign_key_table: "execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this dependency is linked to" dataset: @@ -260,7 +260,7 @@ dataset: foreign_key: True foreign_key_schema: "self" foreign_key_table: 
"execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this dataset is linked to" cli_optional: True is_overwritten: @@ -278,5 +278,5 @@ dataset: description: "True if an external link" is_archived: type: "Boolean" - nullablee: False + nullable: False description: "True if dataset is archived" From 920bc090bf4f1f768c65f6cdef145e8b676cceef Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 10:15:51 +0100 Subject: [PATCH 8/9] Fix CLI for all schema columns when registering a dataset --- src/cli/cli.py | 19 +++++++++++-------- src/cli/register.py | 3 +++ src/dataregistry/registrar.py | 4 ++-- .../create_test_entries_cli.sh | 6 +++++- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/cli/cli.py b/src/cli/cli.py index b6a0f9de..a60ecc9d 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -74,28 +74,31 @@ # Get some information from the `schema.yaml` file for column in schema_data["dataset"]: + extra_args = {} + # Any default? if schema_data["dataset"][column]["cli_default"] is not None: - default = schema_data["dataset"][column]["cli_default"] - default_str = f" (default={default})" + extra_args["default"] = schema_data["dataset"][column]["cli_default"] + default_str = f" (default={extra_args['default']})" else: - default = None default_str = "" # Restricted to choices? if schema_data["dataset"][column]["choices"] is not None: - choices = schema_data["dataset"][column]["choices"] + extra_args["choices"] = schema_data["dataset"][column]["choices"] + + # Is this a boolean flag? + if schema_data["dataset"][column]["type"] == "Boolean": + extra_args["action"] = "store_true" else: - choices = None + extra_args["type"] = _TYPE_TRANSLATE[schema_data["dataset"][column]["type"]] # Add flag if schema_data["dataset"][column]["cli_optional"]: arg_register_dataset.add_argument( "--" + column, help=schema_data["dataset"][column]["description"] + default_str, - default=default, - choices=choices, - type=_TYPE_TRANSLATE[schema_data["dataset"][column]["type"]], + **extra_args, ) # Entries unique to registering the dataset using the CLI diff --git a/src/cli/register.py b/src/cli/register.py index 5518c3e0..ca74870b 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -38,6 +38,9 @@ def register_dataset(args): name=args.name, version_suffix=args.version_suffix, creation_date=args.creation_date, + access_API=args.access_API, + execution_id=args.execution_id, + is_overwritable=args.is_overwritable, description=args.description, old_location=args.old_location, copy=(not args.make_symlink), diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 7d13725a..3866fded 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -505,10 +505,10 @@ def register_dataset( if version_suffix: values["version_suffix"] = version_suffix if creation_date: - values["dataset_creation_date"] = creation_date + values["creation_date"] = creation_date else: if ds_creation_date: - values["dataset_creation_date"] = ds_creation_date + values["creation_date"] = ds_creation_date if description: values["description"] = description if execution_id: diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index b439b4aa..b008e3ac 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -16,7 +16,11 @@ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --owner "DESC" \ 
--owner_type "group" \ --version_suffix "test" \ - --root_dir "DataRegistry_data" + --root_dir "DataRegistry_data" \ + --creation_date "2020-01-01" \ + --input_datasets 1 2 \ + --execution_name "I have given the execution a name" \ + --is_overwritable # A production dataset if [ "$DATAREG_BACKEND" = "postgres" ]; then From 59da1d870fea11ae399ab68ed4d9ce2db016f52c Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 10:24:26 +0100 Subject: [PATCH 9/9] Reduce register doc strings --- src/dataregistry/registrar.py | 101 +++++++++++++--------------------- 1 file changed, 37 insertions(+), 64 deletions(-) diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 3866fded..4eba83cd 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -229,22 +229,19 @@ def register_execution( """ Register a new execution in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - name : str - Typically pipeline name or program name - description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - locale : str, optional - Where was the execution performed? - configuration : str, optional - Path to text file used to configure the execution - input_datasets : list, optional - List of dataset ids that were the input to this execution - input_production_datasets : list, optional - List of production dataset ids that were the input to this execution + name** : str + description** : str, optional + execution_start** : datetime, optional + locale** : str, optional + configuration** : str, optional + input_datasets** : list, optional + input_production_datasets** : list, optional max_config_length : int, optional Maxiumum number of lines to read from a configuration file @@ -326,37 +323,21 @@ def register_dataset( """ Register a new dataset in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - relative_path : str - Destination for the dataset within the data registry. Path is - relative to ``//``. - version : str - Semantic version string of the format MAJOR.MINOR.PATCH *or* - a special flag: "patch", "minor" or "major". - - When a special flag is used it automatically bumps the relative - version for you (see examples for more details). - version_suffix : str, optional - Suffix string to place at the end of the version string. Cannot be - used for production datasets. - name : str, optional - Any convenient, evocative name for the human. - - Note the combination of name, version and version_suffix must be - unique. - creation_date : datetime, optional - Manually set creation date of dataset - description : str, optional - Human-readable description of dataset - execution_id : int, optional - Used to associate dataset with a particular execution - access_API : str, optional - Hint as to how to read the data - is_overwritable : bool, optional - True if dataset may be overwritten (defaults to False). - - Note production datasets cannot be overwritten. 
+ relative_path** : str + version** : str + version_suffix** : str, optional + name** : str, optional + creation_date** : datetime, optional + description** : str, optional + execution_id** : int, optional + access_API** : str, optional + is_overwritable** : bool, optional old_location : str, optional Absolute location of dataset to copy into the data registry. @@ -371,23 +352,13 @@ def register_dataset( only) verbose : bool, optional Provide some additional output information - owner : str, optional - Owner of the dataset. If None, defaults to what was set in - Registrar __init__, if that is also None, defaults to $USER. - owner_type : str, optional - Owner type: "user", "group", or "production". If None, defaults to - what was set in Registrar __init__, if that is also None, defaults - to "user". - execution_name : str, optional - Typically pipeline name or program name - execution_description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - execution_locale : str, optional - Where was the execution performed? - execution_configuration : str, optional - Path to text file used to configure the execution + owner** : str, optional + owner_type** : str, optional + execution_name** : str, optional + execution_description** : str, optional + execution_start** : datetime, optional + execution_locale** : str, optional + execution_configuration** : str, optional input_datasets : list, optional List of dataset ids that were the input to this execution input_production_datasets : list, optional @@ -549,12 +520,14 @@ def register_dataset_alias(self, aliasname, dataset_id): """ Register a new dataset alias in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - aliasname : str - Human readible alias for the dataset - dataset_id : int - Existing dataset ID to attach dataset alias to + aliasname** : str + dataset_id** : int Returns -------
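
[Editor's note: the following is a minimal, self-contained sketch of the schema-driven CLI pattern these patches converge on (schema.yaml entries mapped to argparse flags, with Boolean columns becoming store_true flags as in PATCH 8/9). The inline YAML snippet, the helper names, and the _TYPE_TRANSLATE values shown here are illustrative assumptions, not the project's actual schema.yaml or mapping.]

# sketch_schema_cli.py -- illustrative only
import argparse
import yaml

# Assumed type mapping; the real _TYPE_TRANSLATE in src/cli/cli.py is not shown in full here.
_TYPE_TRANSLATE = {"String": str, "Integer": int, "Float": float, "Boolean": bool}

# Tiny stand-in for schema.yaml, using the same per-column keys the patches rely on.
schema_yaml = """
dataset:
  description:
    type: "String"
    description: "User provided description of the dataset"
    cli_default: null
    cli_optional: True
    choices: null
  is_overwritable:
    type: "Boolean"
    description: "True if dataset may be overwritten"
    cli_default: null
    cli_optional: True
    choices: null
"""

schema_data = yaml.safe_load(schema_yaml)

parser = argparse.ArgumentParser()
for column, info in schema_data["dataset"].items():
    if not info["cli_optional"]:
        continue
    extra_args = {}
    if info["cli_default"] is not None:
        extra_args["default"] = info["cli_default"]
    if info["choices"] is not None:
        extra_args["choices"] = info["choices"]
    if info["type"] == "Boolean":
        # Boolean columns become on/off flags rather than typed options.
        extra_args["action"] = "store_true"
    else:
        extra_args["type"] = _TYPE_TRANSLATE[info["type"]]
    parser.add_argument("--" + column, help=info["description"], **extra_args)

print(parser.parse_args(["--is_overwritable", "--description", "demo"]))

The point of the design is that adding a column to schema.yaml is enough for it to surface in the table models, the CLI, and the generated docs, without editing each consumer by hand.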