From 9edbe087aed848e8e24470c7b0c8226fd627b24a Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:20:17 +0100 Subject: [PATCH 1/9] Add a central schema yaml file. This contains all the tables and row entries in a central location. Now the schema creation script, the CLI and the docs can use this, so we don't have to worry about missing somewhere when adding a new entry to a table. --- .github/workflows/documentation.yml | 2 +- docs/source/_static/css/custom.css | 3 + docs/source/conf.py | 9 +- docs/source/reference_schema.rst | 247 +-------------------- docs/source/templates/schema_table.tmpl | 22 ++ scripts/create_registry_db.py | 181 ++++++---------- src/dataregistry/schema/__init__.py | 1 + src/dataregistry/schema/load_schema.py | 35 +++ src/dataregistry/schema/schema.yaml | 271 ++++++++++++++++++++++++ 9 files changed, 410 insertions(+), 361 deletions(-) create mode 100644 docs/source/_static/css/custom.css create mode 100644 docs/source/templates/schema_table.tmpl create mode 100644 src/dataregistry/schema/__init__.py create mode 100644 src/dataregistry/schema/load_schema.py create mode 100644 src/dataregistry/schema/schema.yaml diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 2c7e4af8..ed2295b7 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -10,7 +10,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install . - pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram + pip install sphinx sphinx_rtd_theme sphinx_toolbox sphinxcontrib-autoprogram sphinxcontrib.datatemplates - name: Sphinx build run: | sphinx-build docs/source _build diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 00000000..7d2e89e6 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,3 @@ +.tight-table td { + white-space: normal !important; +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 8b2fa02e..6bbbb460 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,8 @@ "sphinx_rtd_theme", "sphinx.ext.autodoc", 'sphinx.ext.napoleon', - 'sphinxcontrib.autoprogram' + 'sphinxcontrib.autoprogram', + 'sphinxcontrib.datatemplates' ] project = 'DESC data management' @@ -36,3 +37,9 @@ html_logo = '_static/DREGS_logo_v2.png' autoclass_content = 'both' + +templates_path = ['templates'] + +html_css_files = [ + 'css/custom.css', +] diff --git a/docs/source/reference_schema.rst b/docs/source/reference_schema.rst index 9494ac52..66da4bc2 100644 --- a/docs/source/reference_schema.rst +++ b/docs/source/reference_schema.rst @@ -7,248 +7,5 @@ database (e.g., the default and production schemas) follows the same structure. .. image:: _static/schema_plot.png :alt: Image missing -The dataset table ------------------ - -.. 
list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dataset_id`` - - Unique identifier for dataset - - int - * - ``name`` - - User given name for dataset - - str - * - ``relative_path`` - - Relative path storing the data, relative to `` - - str - * - ``version_major`` - - Major version in semantic string (i.e., X.x.x) - - int - * - ``version_minor`` - - Minor version in semantic string (i.e., x.X.x) - - int - * - ``version_patch`` - - Patch version in semantic string (i.e., x.x.X) - - int - * - ``version_suffix`` - - Optional version suffix - - str - * - ``dataset_creation_date`` - - Dataset creation date - - datetime - * - ``is_archived`` - - True if the data is archived, i.e, the data is longer within `` - - bool - * - ``is_external_link`` - - ??? - - bool - * - ``is_overwritten`` - - True if the original data for this dataset has been overwritten at some point. This would have required that ``is_overwritable`` was set to ``true`` on the original dataset - - bool - * - ``is_valid`` - - ??? - - bool - * - ``register_date`` - - Date the dataset was registered - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - * - ``access_API`` - - Describes the software that can read the dataset (e.g., "gcr-catalogs", "skyCatalogs") - - str - * - ``execution_id`` - - ID of execution this dataset belongs to - - int - * - ``description`` - - User provided description of the dataset - - str - * - ``owner_type`` - - Datasets owner type, can be "user", "group", "project" or "production". - - str - * - ``owner`` - - Owner of the dataset - - str - * - ``data_org`` - - Dataset organisation ("file" or "directory") - - str - * - ``nfiles`` - - How many files are in the dataset - - int - * - ``total_disk_space`` - - Total disk spaced used by the dataset - - float - -The dataset_alias table ------------------------ - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dataset_alias_id`` - - Unique identifier for alias - - int - * - ``name`` - - User given alias name - - str - * - ``dataset_id`` - - ID of dataset this is an alias for - - int - * - ``supersede_date`` - - If a new entry has been added to the table with the same alias name (but - different dataset_id), the old entry will be superseded. ``supersede_date`` - in the old entry tracks when this happened. If the entry has not been - superseded, ``supersede_date`` will be None - - datetime - * - ``register_date`` - - Date the dataset was registered - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The dependency table --------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``dependency_id`` - - Unique identifier for dependency - - int - * - ``execution_id`` - - Execution this dependency is linked to - - int - * - ``input_id`` - - Dataset ID of the dependent dataset - - int - * - ``register_date`` - - Date the dependency was registered - - datetime - -The execution table -------------------- - -.. 
list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``execution_id`` - - Unique identifier for execution - - int - * - ``description`` - - User given discription of execution - - str - * - ``name`` - - User given execution name - - str - * - ``register_date`` - - Date the execution was registered - - datetime - * - ``execution_start`` - - Date the execution started - - datetime - * - ``locale`` - - Locale of execution (e.g., NERSC) - - str - * - ``configuration`` - - Path to configuration file of execution - - str - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The execution_alias table -------------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``execution_alias_id`` - - Unique identifier for execution alias - - int - * - ``execution_id`` - - Execution this alias is linked to - - int - * - ``alias`` - - User given execution alias name - - str - * - ``register_date`` - - Date the execution was registered - - datetime - * - ``supersede_date`` - - If a new entry has been added to the table with the same alias name (but - different dataset_id), the old entry will be superseded. ``supersede_date`` - in the old entry tracks when this happened. If the entry has not been - superseded, ``supersede_date`` will be None - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the dataset - - str - -The provenance table --------------------- - -.. list-table:: - :header-rows: 1 - - * - row - - description - - type - * - ``provenance_id`` - - Unique identifier for provenance - - int - * - ``code_version_major`` - - Major version of code when this schema was created - - int - * - ``code_version_minor`` - - Minor version of code when this schema was created - - int - * - ``code_version_patch`` - - Patch version of code when this schema was created - - int - * - ``code_version_suffix`` - - Version suffix of code when this schema was created - - str - * - ``db_version_major`` - - Major version of database - - int - * - ``db_version_minor`` - - Minor version of database - - int - * - ``db_version_patch`` - - Patch version of database - - int - * - ``git_hash`` - - Git commit hash when this schema was created - - str - * - ``repo_is_clean`` - - Was repository clean when this schema was created - - bool - * - ``update_method`` - - "CREATE", "MODIFY" or "MIGRATE" - - str - * - ``schema_enabled_date`` - - When was the schema enabled - - datetime - * - ``creator_uid`` - - `uid` (user id) of the person that registered the schema - - str - * - ``comment`` - - Any comment - - str +.. datatemplate:yaml:: ../../src/dataregistry/schema/schema.yaml + :template: schema_table.tmpl diff --git a/docs/source/templates/schema_table.tmpl b/docs/source/templates/schema_table.tmpl new file mode 100644 index 00000000..7abbbce6 --- /dev/null +++ b/docs/source/templates/schema_table.tmpl @@ -0,0 +1,22 @@ +.. -*- mode: rst -*- + +{% for table in ['execution','provenance','execution_alias','dataset','dependency','dataset_alias'] %} + +The {{table}} table +---------------------------------------- + +.. 
list-table:: + :header-rows: 1 + :class: tight-table + + * - row + - description + - type + +{% for item in data[table] %} + * - {{item}} +{% for item2 in ['description', 'type'] %} + - {{data[table][item][item2]}} +{% endfor %} +{% endfor %} +{% endfor %} diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index add216a4..bda137de 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -7,6 +7,7 @@ from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION from dataregistry.db_basic import add_table_row, _insert_provenance +from dataregistry.schema import load_schema """ A script to create the default dataregistry schema and the production schema. @@ -20,6 +21,57 @@ - "provenance" : Contains information about the database/schema """ +# Conversion from string types in `schema.yaml` to SQLAlchemy +_TYPE_TRANSLATE = {"String": String, "Integer": Integer, "DateTime": DateTime, + "StringShort": String(20), "StringLong": String(250), "Boolean": Boolean, + "Float": Float} + +# Load the schema from the `schema.yaml` file +schema_yaml = load_schema() + +def _get_rows(schema, table): + """ + Build the SQLAlchemy `Column` list for this table from the information in + the `schema.yaml` file. + + Parameters + ---------- + schema : str + table : str + + Returns + ------- + return_dict : dict + SQLAlchemy Column entries for each table row + """ + + return_dict = {} + for column in schema_yaml[table].keys(): + + # Special case where row has a foreign key + if schema_yaml[table][column]["foreign_key"]: + if schema_yaml[table][column]["foreign_key_schema"] == "self": + schema_yaml[table][column]["foreign_key_schema"] = schema + + return_dict[column] = Column(column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + ForeignKey(_get_ForeignKey_str( + schema_yaml[table][column]["foreign_key_schema"], + schema_yaml[table][column]["foreign_key_table"], + schema_yaml[table][column]["foreign_key_row"])), + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], + ) + + # Normal case + else: + return_dict[column] = Column(column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"]) + + return return_dict + class Base(DeclarativeBase): pass @@ -35,25 +87,8 @@ def _Provenance(schema): class_name = f"{schema}_provenance" - # Rows - rows = { - "provenance_id": Column("provenance_id", Integer, primary_key=True), - "code_version_major": Column("code_version_major", Integer, nullable=False), - "code_version_minor": Column("code_version_minor", Integer, nullable=False), - "code_version_patch": Column("code_version_patch", Integer, nullable=False), - "code_version_suffix": Column("code_version_suffix", String), - "db_version_major": Column("db_version_major", Integer, nullable=False), - "db_version_minor": Column("db_version_minor", Integer, nullable=False), - "db_version_patch": Column("db_version_patch", Integer, nullable=False), - "git_hash": Column("git_hash", String, nullable=True), - "repo_is_clean": Column("repo_is_clean", Boolean, nullable=True), - # update method is always "CREATE" for this script. 
- # Alternative could be "MODIFY" or "MIGRATE" - "update_method": Column("update_method", String(10), nullable=False), - "schema_enabled_date": Column("schema_enabled_date", DateTime, nullable=False), - "creator_uid": Column("creator_uid", String(20), nullable=False), - "comment": Column("comment", String(250)), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "provenance") # Table metadata meta = {"__tablename__": "provenance", "__table_args__": {"schema": schema}} @@ -67,19 +102,8 @@ def _Execution(schema): class_name = f"{schema}_execution" - # Rows - rows = { - "execution_id": Column("execution_id", Integer, primary_key=True), - "description": Column("description", String), - "register_date": Column("register_date", DateTime, nullable=False), - "execution_start": Column("execution_start", DateTime), - # name is meant to identify the code executed. E.g., could be pipeline name - "name": Column("name", String), - # locale is, e.g. site where code was run - "locale": Column("locale", String), - "configuration": Column("configuration", String), - "creator_uid": Column("creator_uid", String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "execution") # Table metadata meta = {"__tablename__": "execution", "__table_args__": {"schema": schema}} @@ -93,18 +117,8 @@ def _ExecutionAlias(schema): class_name = f"{schema}_execution_alias" - # Rows - rows = { - "execution_alias_id": Column("execution_alias_id", Integer, primary_key=True), - "alias": Column(String, nullable=False), - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "supersede_date": Column(DateTime, default=None), - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "execution_alias") # Table metadata meta = { @@ -124,17 +138,8 @@ def _DatasetAlias(schema): class_name = f"{schema}_dataset_alias" - # Rows - rows = { - "dataset_alias_id": Column(Integer, primary_key=True), - "alias": Column(String, nullable=False), - "dataset_id": Column( - Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id")) - ), - "supersede_date": Column(DateTime, default=None), - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dataset_alias") # Table metadata meta = { @@ -154,47 +159,8 @@ def _Dataset(schema): class_name = f"{schema}_dataset" - # Rows - rows = { - "dataset_id": Column(Integer, primary_key=True), - "name": Column(String, nullable=False), - "relative_path": Column(String, nullable=False), - "version_major": Column(Integer, nullable=False), - "version_minor": Column(Integer, nullable=False), - "version_patch": Column(Integer, nullable=False), - "version_string": Column(String, nullable=False), - "version_suffix": Column(String), - "dataset_creation_date": Column(DateTime), - "is_archived": Column(Boolean, default=False), - "is_external_link": Column(Boolean, default=False), - "is_overwritable": Column(Boolean, default=False), - "is_overwritten": Column(Boolean, default=False), - "is_valid": Column(Boolean, default=True), # False if, e.g., copy failed - # The following are boilerplate, included in all or most tables - "register_date": Column(DateTime, nullable=False), - "creator_uid": Column(String(20), nullable=False), - # Make access_API a string for now, but it 
could be an enumeration or - # a foreign key into another table. Possible values for the column - # might include "gcr-catalogs", "skyCatalogs" - "access_API": Column("access_API", String(20)), - # A way to associate a dataset with a program execution or "run" - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "description": Column(String), - "owner_type": Column(String, nullable=False), - # If ownership_type is 'production', then owner is always 'production' - # If ownership_type is 'group', owner will be a group name - # If ownership_type is 'user', owner will be a user name - "owner": Column(String, nullable=False), - # To store metadata about the dataset. - "data_org": Column("data_org", String, nullable=False), - "nfiles": Column("nfiles", Integer, nullable=False), - "total_disk_space": Column("total_disk_space", Float, nullable=False), - # What `root_dir` was the data originially ingested into - "register_root_dir": Column(String, nullable=False), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dataset") # Table metadata meta = { @@ -217,25 +183,12 @@ def _Dependency(schema, has_production): class_name = f"{schema}_dependency" - # Rows - rows = { - "dependency_id": Column(Integer, primary_key=True), - "register_date": Column(DateTime, nullable=False), - "execution_id": Column( - Integer, - ForeignKey(_get_ForeignKey_str(schema, "execution", "execution_id")), - ), - "input_id": Column( - Integer, ForeignKey(_get_ForeignKey_str(schema, "dataset", "dataset_id")) - ), - } + # Load rows from `schema.yaml` file + rows = _get_rows(schema, "dependency") - # Add link to production schema. - if has_production: - rows["input_production_id"] = Column( - Integer, - ForeignKey(_get_ForeignKey_str("production", "dataset", "dataset_id")), - ) + # Remove link to production schema. + if not has_production: + del rows["input_production_id"] # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} diff --git a/src/dataregistry/schema/__init__.py b/src/dataregistry/schema/__init__.py new file mode 100644 index 00000000..329fa0e8 --- /dev/null +++ b/src/dataregistry/schema/__init__.py @@ -0,0 +1 @@ +from .load_schema import load_schema diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py new file mode 100644 index 00000000..e1f2b19f --- /dev/null +++ b/src/dataregistry/schema/load_schema.py @@ -0,0 +1,35 @@ +import os +import yaml + +def _populate_defaults(mydict): + """ + Populate the default values for rows that haven't been specified in the + YAML file. 
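+    Any attribute not listed for a row in the YAML falls back to a default
+    here, i.e., the row is assumed to be nullable, not a primary key and not
+    a foreign key.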
+ + Parameters + ---------- + mydict : dict + """ + + # Attributes we check for and populate with these default value if missing + atts = {"nullable": True, "primary_key": False, "foreign_key": False} + + # Loop over eah row and ingest + for table in mydict.keys(): + for row in mydict[table].keys(): + for att in atts.keys(): + if att not in mydict[table][row].keys(): + mydict[table][row][att] = atts[att] + +def load_schema(): + """ Load the schema layout from the YAML file """ + + # Load + yaml_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") + with open(yaml_file_path, 'r') as file: + yaml_data = yaml.safe_load(file) + + # Populate defaults + _populate_defaults(yaml_data) + + return yaml_data diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml new file mode 100644 index 00000000..0c6dadbd --- /dev/null +++ b/src/dataregistry/schema/schema.yaml @@ -0,0 +1,271 @@ +--- +execution: + execution_id: + type: "Integer" + primary_key: True + description: "Unique identifier for execution" + description: + type: "String" + description: "Short description of execution" + register_date: + type: "DateTime" + description: "When was the execution registered in the database" + nullable: False + execution_start: + type: "DateTime" + description: "When was the execution performed at `locale`" + name: + type: "String" + description: "Identifies the code executed (e.g., could be pipeline name)" + locale: + type: "String" + description: "Site where the code was run (e.g., NERSC)" + configuration: + type: "String" + description: "Path to execution configuration file (txt, YAML, TOML, etc). Ingested as raw text" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + +provenance: + provenance_id: + type: "Integer" + primary_key: True + description: "Unique identifier for this provenance entry" + code_version_major: + type: "Integer" + description: "Major version of code when this schema was created" + nullable: False + code_version_minor: + type: "Integer" + description: "Minor version of code when this schema was created" + nullable: False + code_version_patch: + type: "Integer" + description: "Patch version of code when this schema was created" + nullable: False + code_version_suffix: + type: "String" + description: "Version suffix of code when this schema was created" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + db_version_major: + type: "Integer" + description: "Major version of schema" + nullable: False + db_version_minor: + type: "Integer" + description: "Minor version of schema" + nullable: False + db_version_patch: + type: "Integer" + description: "Patch version of schema" + nullable: False + git_hash: + type: "String" + description: "Git hash at time of schema creation" + repo_is_clean: + type: "Boolean" + description: "Was git repo clean at schema creation?" 
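+  # The schema creation script always writes "CREATE" for `update_method`;
+  # "MODIFY" and "MIGRATE" are the alternative values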
+ update_method: + type: "String" + description: "What type of schema update does this entry relate to ('CREATE','MODIFY','MIGRATE')" + nullable: False + schema_enabled_date: + type: "DateTime" + description: "Date schema was created/updated" + nullable: False + comment: + type: "StringLong" + description: "Comment relating to new provenance entry" + +execution_alias: + execution_alias_id: + description: "Unique identifier for execution alias" + type: "Integer" + primary_key: True + supersede_date: + type: "DateTime" + description: "If a new entry has been added to the table with the same alias name (but different `dataset_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + register_date: + type: "DateTime" + description: "Date the execution alias was registered" + nullable: False + alias: + type: "String" + description: "User given execution alias name" + nullable: False + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this alias is linked to" + +dataset_alias: + dataset_alias_id: + description: "Unique identifier for dataset alias" + type: "Integer" + primary_key: True + supersede_date: + type: "DateTime" + description: "If a new entry has been added to the table with the same alias name (but different `dataset_id`), the old entry will be superseded. `supersede_date` in the old entry tracks when this happened. If the entry has not been superseded, `supersede_date` will be None" + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + register_date: + type: "DateTime" + description: "Date the dataset alias was registered" + nullable: False + alias: + type: "String" + description: "User given dataset alias name" + nullable: False + dataset_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Dataset this alias is linked to" + +dependency: + dependency_id: + description: "Unique identifier for dependency" + type: "Integer" + primary_key: True + register_date: + type: "DateTime" + description: "Date the dependency was registered" + nullable: False + input_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Dataset this dependency is linked to" + input_production_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "production" + foreign_key_table: "dataset" + foreign_key_row: "dataset_id" + description: "Production dataset this dependency is linked to" + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this dependency is linked to" + +dataset: + dataset_id: + type: "Integer" + primary_key: True + description: "Unique identifier for this dataset" + name: + type: "String" + description: "User given name for this dataset" + nullable: False + relative_path: + type: "String" + description: "Relative path storing the data, relative to ``" + nullable: False + version_major: + type: "Integer" + description: "Major version in semantic string (i.e., X.x.x)" + nullable: 
False + version_minor: + type: "Integer" + description: "Minor version in semantic string (i.e., x.X.x)" + nullable: False + version_patch: + type: "Integer" + description: "Patch version in semantic string (i.e., x.x.X)" + nullable: False + version_suffix: + type: "String" + description: "Optional version suffix" + version_string: + type: "String" + description: "Version string" + nullable: False + dataset_creation_date: + type: "DateTime" + description: "Dataset creation date" + register_date: + type: "DateTime" + description: "Date the dataset was registered" + nullable: False + creator_uid: + type: "StringShort" + description: "UID of person who registered the entry" + nullable: False + access_API: + type: "StringShort" + description: "Describes the software that can read the dataset (e.g., 'gcr-catalogs', 'skyCatalogs')" + owner: + type: "String" + description: "Owner of the dataset" + nullable: False + owner_type: + type: "String" + description: "Datasets owner type, can be 'user', 'group', 'project' or 'production'." + nullable: False + data_org: + type: "String" + description: "Dataset organisation ('file' or 'directory')" + nullable: False + nfiles: + type: "Integer" + description: "How many files are in the dataset" + nullable: False + total_disk_space: + type: "Float" + description: "Total disk spaced used by the dataset" + nullable: False + register_root_dir: + type: "String" + description: "The `root_dir` the dataset was originally ingested into" + nullable: False + description: + type: "String" + description: "User provided description of the dataset" + is_valid: + type: "Boolean" + default: True + description: "False if, e.g., copy failed" + execution_id: + type: "Integer" + foreign_key: True + foreign_key_schema: "self" + foreign_key_table: "execution" + foreign_key_row: "execution_id" + description: "Execution this dataset is linked to" + is_overwritten: + type: "Boolean" + default: False + description: "True if the original data for this dataset has been overwritten at some point. 
This would have required that `is_overwritable` was set to true on the original dataset" + is_overwritable: + type: "Boolean" + default: False + description: "True means this dataset can be overwritten in the future" + is_external_link: + type: "Boolean" + default: False + description: "True if an external link" + is_archived: + type: "Boolean" + default: False + description: "True if dataset is archived" From 0b360da82fbf154198f880e84a74b085c80de265 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:25:02 +0100 Subject: [PATCH 2/9] Add schema.yaml to pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c15b3459..8cc09883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,4 +38,4 @@ where = ["src"] dregs = "cli.cli:main" [tool.setuptools.package-data] -"dataregistry" = ["site_config/site_rootdir.yaml"] +"dataregistry" = ["site_config/site_rootdir.yaml", "schema/schema.yaml"] From 375f1eb629df1c0c8df8520ddcfe15e443c84170 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:30:00 +0100 Subject: [PATCH 3/9] Format --- scripts/create_registry_db.py | 67 +++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index bda137de..0e2d5aff 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -22,13 +22,20 @@ """ # Conversion from string types in `schema.yaml` to SQLAlchemy -_TYPE_TRANSLATE = {"String": String, "Integer": Integer, "DateTime": DateTime, - "StringShort": String(20), "StringLong": String(250), "Boolean": Boolean, - "Float": Float} +_TYPE_TRANSLATE = { + "String": String, + "Integer": Integer, + "DateTime": DateTime, + "StringShort": String(20), + "StringLong": String(250), + "Boolean": Boolean, + "Float": Float, +} # Load the schema from the `schema.yaml` file schema_yaml = load_schema() + def _get_rows(schema, table): """ Build the SQLAlchemy `Column` list for this table from the information in @@ -38,7 +45,7 @@ def _get_rows(schema, table): ---------- schema : str table : str - + Returns ------- return_dict : dict @@ -47,35 +54,59 @@ def _get_rows(schema, table): return_dict = {} for column in schema_yaml[table].keys(): - # Special case where row has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": schema_yaml[table][column]["foreign_key_schema"] = schema - return_dict[column] = Column(column, - _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], - ForeignKey(_get_ForeignKey_str( + return_dict[column] = Column( + column, + _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + ForeignKey( + _get_ForeignKey_str( schema_yaml[table][column]["foreign_key_schema"], - schema_yaml[table][column]["foreign_key_table"], - schema_yaml[table][column]["foreign_key_row"])), - primary_key=schema_yaml[table][column]["primary_key"], - nullable=schema_yaml[table][column]["nullable"], + schema_yaml[table][column]["foreign_key_table"], + schema_yaml[table][column]["foreign_key_row"], + ) + ), + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], ) # Normal case else: - return_dict[column] = Column(column, - _TYPE_TRANSLATE[schema_yaml[table][column]["type"]], - primary_key=schema_yaml[table][column]["primary_key"], - nullable=schema_yaml[table][column]["nullable"]) + return_dict[column] = Column( + column, + 
_TYPE_TRANSLATE[schema_yaml[table][column]["type"]], + primary_key=schema_yaml[table][column]["primary_key"], + nullable=schema_yaml[table][column]["nullable"], + ) return return_dict + class Base(DeclarativeBase): pass + def _get_ForeignKey_str(schema, table, row): + """ + Get the string reference to the ".." a foreign key will + point to. + + The schema address will only be included for postgres backends. + + Parameters + --------- + schema : str + table : str + row : str + + Returns + ------- + - : str + """ + if schema is None: return f"{table}.{row}" else: @@ -189,7 +220,7 @@ def _Dependency(schema, has_production): # Remove link to production schema. if not has_production: del rows["input_production_id"] - + # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} @@ -241,7 +272,7 @@ def _Dependency(schema, has_production): stmt = f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}" conn.execute(text(stmt)) conn.commit() - + # Grant reg_reader access acct = "reg_reader" for SCHEMA in SCHEMA_LIST: From 0b499b80bca2e1ba8ee3e8a3734b13ca68ed1aa2 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 16:48:49 +0100 Subject: [PATCH 4/9] Remove defaults from table creation, move to registrar --- scripts/create_registry_db.py | 3 ++- src/cli/cli.py | 1 + src/dataregistry/registrar.py | 3 +++ src/dataregistry/schema/load_schema.py | 20 ++++++++++++-------- src/dataregistry/schema/schema.yaml | 10 +++++----- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index 0e2d5aff..8506ba7f 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -2,7 +2,7 @@ import sys import argparse from datetime import datetime -from sqlalchemy import Column, Integer, String, DateTime, Boolean, Index, Float +from sqlalchemy import Column, ColumnDefault, Integer, String, DateTime, Boolean, Index, Float from sqlalchemy import ForeignKey, UniqueConstraint, text from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION @@ -54,6 +54,7 @@ def _get_rows(schema, table): return_dict = {} for column in schema_yaml[table].keys(): + # Special case where row has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": diff --git a/src/cli/cli.py b/src/cli/cli.py index ed07b67d..1786f825 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -4,6 +4,7 @@ from dataregistry.db_basic import SCHEMA_VERSION from .register import register_dataset from .query import dregs_ls +from dataregistry.schema import load_schema # --------------------- # The data registry CLI diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 309d4d41..7d13725a 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -517,6 +517,9 @@ def register_dataset( values["access_API"] = access_API values["is_overwritable"] = is_overwritable values["is_overwritten"] = False + values["is_external_link"] = False + values["is_archived"] = False + values["is_valid"] = True values["register_date"] = datetime.now() values["owner_type"] = owner_type values["owner"] = owner diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py index e1f2b19f..d9a1fac8 100644 --- a/src/dataregistry/schema/load_schema.py +++ b/src/dataregistry/schema/load_schema.py @@ -1,6 +1,7 @@ import os import yaml + def _populate_defaults(mydict): """ 
Populate the default values for rows that haven't been specified in the @@ -10,23 +11,26 @@ def _populate_defaults(mydict): ---------- mydict : dict """ - + # Attributes we check for and populate with these default value if missing atts = {"nullable": True, "primary_key": False, "foreign_key": False} # Loop over eah row and ingest for table in mydict.keys(): - for row in mydict[table].keys(): - for att in atts.keys(): - if att not in mydict[table][row].keys(): - mydict[table][row][att] = atts[att] + for row in mydict[table].keys(): + for att in atts.keys(): + if att not in mydict[table][row].keys(): + mydict[table][row][att] = atts[att] + def load_schema(): - """ Load the schema layout from the YAML file """ + """Load the schema layout from the YAML file""" # Load - yaml_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schema.yaml") - with open(yaml_file_path, 'r') as file: + yaml_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "schema.yaml" + ) + with open(yaml_file_path, "r") as file: yaml_data = yaml.safe_load(file) # Populate defaults diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index 0c6dadbd..3449de97 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -244,7 +244,7 @@ dataset: description: "User provided description of the dataset" is_valid: type: "Boolean" - default: True + nullable: False description: "False if, e.g., copy failed" execution_id: type: "Integer" @@ -255,17 +255,17 @@ dataset: description: "Execution this dataset is linked to" is_overwritten: type: "Boolean" - default: False + nullable: False description: "True if the original data for this dataset has been overwritten at some point. This would have required that `is_overwritable` was set to true on the original dataset" is_overwritable: type: "Boolean" - default: False + nullable: False description: "True means this dataset can be overwritten in the future" is_external_link: type: "Boolean" - default: False + nullable: False description: "True if an external link" is_archived: type: "Boolean" - default: False + nullablee: False description: "True if dataset is archived" From 65f4c6eee37b188c3ff33d084028306e9fc470a2 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 17:33:55 +0100 Subject: [PATCH 5/9] Modify CLI to use schema.yaml --- src/cli/cli.py | 92 +++++++++---------- src/cli/register.py | 3 +- src/dataregistry/schema/load_schema.py | 3 +- src/dataregistry/schema/schema.yaml | 21 ++++- .../create_test_entries_cli.sh | 8 +- 5 files changed, 65 insertions(+), 62 deletions(-) diff --git a/src/cli/cli.py b/src/cli/cli.py index 1786f825..86bcc62d 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -46,6 +46,20 @@ # Register a dataset # ------------------ +# Load the schema information +schema_data = load_schema() + +# Conversion from string types in `schema.yaml` to SQLAlchemy +_TYPE_TRANSLATE = { + "String": str, + "Integer": int, + "DateTime": str, + "StringShort": str, + "StringLong": str, + "Boolean": bool, + "Float": float, +} + # Register a new database entry. arg_register = subparsers.add_parser( "register", help="Register a new entry to the database" @@ -58,6 +72,33 @@ # Register a new dataset. arg_register_dataset = arg_register_sub.add_parser("dataset", help="Register a dataset") +# Get some information from the `schema.yaml` file +for row in schema_data["dataset"]: + # Any default? 
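+    # (e.g. `owner_type` declares `cli_default: "user"` in `schema.yaml`; for
+    # that row the loop ends up calling roughly
+    #   add_argument("--owner_type", default="user", choices=[...], type=str),
+    # so the flag falls back to "user" when omitted on the command line)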
+ if schema_data["dataset"][row]["cli_default"] is not None: + default = schema_data["dataset"][row]["cli_default"] + default_str = f" (default={default})" + else: + default = None + default_str = "" + + # Restricted to choices? + if schema_data["dataset"][row]["choices"] is not None: + choices = schema_data["dataset"][row]["choices"] + else: + choices = None + + # Add flag + if schema_data["dataset"][row]["cli_optional"]: + arg_register_dataset.add_argument( + "--" + row, + help=schema_data["dataset"][row]["description"] + default_str, + default=default, + choices=choices, + type=_TYPE_TRANSLATE[schema_data["dataset"][row]["type"]], + ) + +# Entries unique to registering the dataset using the CLI arg_register_dataset.add_argument( "relative_path", help=( @@ -76,45 +117,6 @@ ), type=str, ) -arg_register_dataset.add_argument( - "--version_suffix", - help=( - "Optional suffix string to place at the end of the version string." - "Cannot be used for production datasets." - ), - type=str, -) -arg_register_dataset.add_argument( - "--name", - help=( - "Any convenient, evocative name for the human. Note the combination of" - "name, version and version_suffix must be unique. If None name is generated" - "from the relative path." - ), - type=str, -) -arg_register_dataset.add_argument( - "--creation_date", help="Manually set creation date of dataset" -) -arg_register_dataset.add_argument( - "--description", help="Human-readable description of dataset", type=str -) -arg_register_dataset.add_argument( - "--execution_id", - help="Used to associate dataset with a particular execution", - type=int, -) -arg_register_dataset.add_argument( - "--access_API", help="Hint as to how to read the data", type=str -) -arg_register_dataset.add_argument( - "--is_overwritable", - help=( - "True if dataset may be overwritten (defaults to False). Production" - "datasets cannot be overwritten." - ), - action="store_true", -) arg_register_dataset.add_argument( "--old_location", help=( @@ -138,18 +140,6 @@ default=f"{SCHEMA_VERSION}", help="Which schema to connect to", ) -arg_register_dataset.add_argument( - "--locale", - help="Location where dataset was produced", - type=str, - default="NERSC", -) -arg_register_dataset.add_argument( - "--owner", help="Owner of dataset. Defaults to $USER." 
-) -arg_register_dataset.add_argument( - "--owner-type", choices=["production", "group", "user"], default="user" -) arg_register_dataset.add_argument( "--config_file", help="Location of data registry config file", type=str ) diff --git a/src/cli/register.py b/src/cli/register.py index 98340e91..5518c3e0 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -9,7 +9,7 @@ def register_dataset(args): Parameters ---------- args : argparse object - + args.config_file : str Path to data registry config file args.schema : str @@ -43,6 +43,7 @@ def register_dataset(args): copy=(not args.make_symlink), is_dummy=args.is_dummy, owner=args.owner, + owner_type=args.owner_type, execution_name=args.execution_name, execution_description=args.execution_description, execution_start=args.execution_start, diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py index d9a1fac8..fbd0f238 100644 --- a/src/dataregistry/schema/load_schema.py +++ b/src/dataregistry/schema/load_schema.py @@ -13,7 +13,8 @@ def _populate_defaults(mydict): """ # Attributes we check for and populate with these default value if missing - atts = {"nullable": True, "primary_key": False, "foreign_key": False} + atts = {"nullable": True, "primary_key": False, "foreign_key": False, "cli_optional": False, + "cli_default": None, "choices": None} # Loop over eah row and ingest for table in mydict.keys(): diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index 3449de97..e9e1c214 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -176,8 +176,9 @@ dataset: description: "Unique identifier for this dataset" name: type: "String" - description: "User given name for this dataset" + description: "Any convenient, evocative name for the human. Note the combination of name, version and version_suffix must be unique. If None name is generated from the relative path." nullable: False + cli_optional: True relative_path: type: "String" description: "Relative path storing the data, relative to ``" @@ -196,14 +197,16 @@ dataset: nullable: False version_suffix: type: "String" - description: "Optional version suffix" + description: "Optional version suffix to place at the end of the version string. Cannot be used for production datasets." + cli_optional: True version_string: type: "String" description: "Version string" nullable: False - dataset_creation_date: + creation_date: type: "DateTime" description: "Dataset creation date" + cli_optional: True register_date: type: "DateTime" description: "Date the dataset was registered" @@ -215,14 +218,19 @@ dataset: access_API: type: "StringShort" description: "Describes the software that can read the dataset (e.g., 'gcr-catalogs', 'skyCatalogs')" + cli_optional: True owner: type: "String" - description: "Owner of the dataset" + description: "Owner of the dataset (defaults to $USER)" nullable: False + cli_optional: True owner_type: type: "String" description: "Datasets owner type, can be 'user', 'group', 'project' or 'production'." 
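+    # (when `owner_type` is 'production' the owner is always 'production';
+    # for 'group'/'user' it is a group or user name)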
nullable: False + choices: ["user", "group", "project", "production"] + cli_default: "user" + cli_optional: True data_org: type: "String" description: "Dataset organisation ('file' or 'directory')" @@ -241,7 +249,8 @@ dataset: nullable: False description: type: "String" - description: "User provided description of the dataset" + description: "User provided human-readable description of the dataset" + cli_optional: True is_valid: type: "Boolean" nullable: False @@ -253,6 +262,7 @@ dataset: foreign_key_table: "execution" foreign_key_row: "execution_id" description: "Execution this dataset is linked to" + cli_optional: True is_overwritten: type: "Boolean" nullable: False @@ -261,6 +271,7 @@ dataset: type: "Boolean" nullable: False description: "True means this dataset can be overwritten in the future" + cli_optional: True is_external_link: type: "Boolean" nullable: False diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index 9599b48b..db37883c 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -13,14 +13,14 @@ dregs register dataset my_cli_dataset2 "patch" \ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --description "This is my dataset description" \ --access_API "Awesome API" \ - --locale "Secret location" \ --owner "DESC" \ - --owner-type "group" \ + --owner_type "group" \ --version_suffix "test" \ --root_dir "DataRegistry_data" # A production dataset dregs register dataset my_production_cli_dataset "0.1.2" \ - --owner-type "production" \ + --owner_type "production" \ --is_dummy \ - --root_dir "DataRegistry_data" + --root_dir "DataRegistry_data" \ + --schema "production" From 5d6b0ce9a92103073d1e7689adbda725c85c83ec Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Sat, 2 Dec 2023 17:41:09 +0100 Subject: [PATCH 6/9] Fix for sqlite --- .github/workflows/ci.yml | 2 ++ tests/end_to_end_tests/create_test_entries_cli.sh | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 88a0f860..c9e893e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,7 @@ jobs: env: DATAREG_CONFIG: "${{ github.workspace }}/config.txt" + DATAREG_BACKEND: "postgres" # Service containers to run with `runner-job` services: @@ -150,6 +151,7 @@ jobs: env: DATAREG_CONFIG: "${{ github.workspace }}/config.txt" + DATAREG_BACKEND: "sqlite" # Our strategy lists the OS and Python versions we want to test on. strategy: diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index db37883c..b439b4aa 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -19,8 +19,10 @@ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --root_dir "DataRegistry_data" # A production dataset -dregs register dataset my_production_cli_dataset "0.1.2" \ - --owner_type "production" \ - --is_dummy \ - --root_dir "DataRegistry_data" \ - --schema "production" +if [ "$DATAREG_BACKEND" = "postgres" ]; then + dregs register dataset my_production_cli_dataset "0.1.2" \ + --owner_type "production" \ + --is_dummy \ + --root_dir "DataRegistry_data" \ + --schema "production" +fi From abb8492339fa8cfcc205636e954b837add0f9a5e Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 09:30:57 +0100 Subject: [PATCH 7/9] Address reviewer comments. 
Mainly chaging references of 'row' to 'column'. --- scripts/create_registry_db.py | 70 ++++++++++++++++------------- src/cli/cli.py | 18 ++++---- src/dataregistry/schema/schema.yaml | 18 ++++---- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/scripts/create_registry_db.py b/scripts/create_registry_db.py index 8506ba7f..4c21d302 100644 --- a/scripts/create_registry_db.py +++ b/scripts/create_registry_db.py @@ -2,11 +2,20 @@ import sys import argparse from datetime import datetime -from sqlalchemy import Column, ColumnDefault, Integer, String, DateTime, Boolean, Index, Float +from sqlalchemy import ( + Column, + ColumnDefault, + Integer, + String, + DateTime, + Boolean, + Index, + Float, +) from sqlalchemy import ForeignKey, UniqueConstraint, text from sqlalchemy.orm import relationship, DeclarativeBase from dataregistry.db_basic import DbConnection, SCHEMA_VERSION -from dataregistry.db_basic import add_table_row, _insert_provenance +from dataregistry.db_basic import _insert_provenance from dataregistry.schema import load_schema """ @@ -36,7 +45,7 @@ schema_yaml = load_schema() -def _get_rows(schema, table): +def _get_column_definitions(schema, table): """ Build the SQLAlchemy `Column` list for this table from the information in the `schema.yaml` file. @@ -49,13 +58,12 @@ def _get_rows(schema, table): Returns ------- return_dict : dict - SQLAlchemy Column entries for each table row + SQLAlchemy Column entries for each table """ return_dict = {} for column in schema_yaml[table].keys(): - - # Special case where row has a foreign key + # Special case where column has a foreign key if schema_yaml[table][column]["foreign_key"]: if schema_yaml[table][column]["foreign_key_schema"] == "self": schema_yaml[table][column]["foreign_key_schema"] = schema @@ -67,7 +75,7 @@ def _get_rows(schema, table): _get_ForeignKey_str( schema_yaml[table][column]["foreign_key_schema"], schema_yaml[table][column]["foreign_key_table"], - schema_yaml[table][column]["foreign_key_row"], + schema_yaml[table][column]["foreign_key_column"], ) ), primary_key=schema_yaml[table][column]["primary_key"], @@ -90,9 +98,9 @@ class Base(DeclarativeBase): pass -def _get_ForeignKey_str(schema, table, row): +def _get_ForeignKey_str(schema, table, column): """ - Get the string reference to the ".
." a foreign key will + Get the string reference to the ".
." a foreign key will point to. The schema address will only be included for postgres backends. @@ -101,7 +109,7 @@ def _get_ForeignKey_str(schema, table, row): --------- schema : str table : str - row : str + column : str Returns ------- @@ -109,9 +117,9 @@ def _get_ForeignKey_str(schema, table, row): """ if schema is None: - return f"{table}.{row}" + return f"{table}.{column}" else: - return f"{schema}.{table}.{row}" + return f"{schema}.{table}.{column}" def _Provenance(schema): @@ -119,13 +127,13 @@ def _Provenance(schema): class_name = f"{schema}_provenance" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "provenance") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "provenance") # Table metadata meta = {"__tablename__": "provenance", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -134,13 +142,13 @@ def _Execution(schema): class_name = f"{schema}_execution" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "execution") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "execution") # Table metadata meta = {"__tablename__": "execution", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -149,8 +157,8 @@ def _ExecutionAlias(schema): class_name = f"{schema}_execution_alias" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "execution_alias") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "execution_alias") # Table metadata meta = { @@ -161,7 +169,7 @@ def _ExecutionAlias(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -170,8 +178,8 @@ def _DatasetAlias(schema): class_name = f"{schema}_dataset_alias" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dataset_alias") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dataset_alias") # Table metadata meta = { @@ -182,7 +190,7 @@ def _DatasetAlias(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -191,8 +199,8 @@ def _Dataset(schema): class_name = f"{schema}_dataset" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dataset") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dataset") # Table metadata meta = { @@ -206,7 +214,7 @@ def _Dataset(schema): ), } - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model @@ -215,17 +223,17 @@ def _Dependency(schema, has_production): class_name = f"{schema}_dependency" - # Load rows from `schema.yaml` file - rows = _get_rows(schema, "dependency") + # Load columns from `schema.yaml` file + columns = _get_column_definitions(schema, "dependency") # Remove link to production schema. 
if not has_production: - del rows["input_production_id"] + del columns["input_production_id"] # Table metadata meta = {"__tablename__": "dependency", "__table_args__": {"schema": schema}} - Model = type(class_name, (Base,), {**rows, **meta}) + Model = type(class_name, (Base,), {**columns, **meta}) return Model diff --git a/src/cli/cli.py b/src/cli/cli.py index 86bcc62d..b6a0f9de 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -73,29 +73,29 @@ arg_register_dataset = arg_register_sub.add_parser("dataset", help="Register a dataset") # Get some information from the `schema.yaml` file -for row in schema_data["dataset"]: +for column in schema_data["dataset"]: # Any default? - if schema_data["dataset"][row]["cli_default"] is not None: - default = schema_data["dataset"][row]["cli_default"] + if schema_data["dataset"][column]["cli_default"] is not None: + default = schema_data["dataset"][column]["cli_default"] default_str = f" (default={default})" else: default = None default_str = "" # Restricted to choices? - if schema_data["dataset"][row]["choices"] is not None: - choices = schema_data["dataset"][row]["choices"] + if schema_data["dataset"][column]["choices"] is not None: + choices = schema_data["dataset"][column]["choices"] else: choices = None # Add flag - if schema_data["dataset"][row]["cli_optional"]: + if schema_data["dataset"][column]["cli_optional"]: arg_register_dataset.add_argument( - "--" + row, - help=schema_data["dataset"][row]["description"] + default_str, + "--" + column, + help=schema_data["dataset"][column]["description"] + default_str, default=default, choices=choices, - type=_TYPE_TRANSLATE[schema_data["dataset"][row]["type"]], + type=_TYPE_TRANSLATE[schema_data["dataset"][column]["type"]], ) # Entries unique to registering the dataset using the CLI diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index e9e1c214..972fa4e5 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -107,7 +107,7 @@ execution_alias: foreign_key: True foreign_key_schema: "self" foreign_key_table: "execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this alias is linked to" dataset_alias: @@ -135,7 +135,7 @@ dataset_alias: foreign_key: True foreign_key_schema: "self" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" + foreign_key_column: "dataset_id" description: "Dataset this alias is linked to" dependency: @@ -152,21 +152,21 @@ dependency: foreign_key: True foreign_key_schema: "self" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" - description: "Dataset this dependency is linked to" + foreign_key_column: "dataset_id" + description: "Dataset this dependency is linked to (for every dependency, this, or `input_production_id`, must be non-null)" input_production_id: type: "Integer" foreign_key: True foreign_key_schema: "production" foreign_key_table: "dataset" - foreign_key_row: "dataset_id" - description: "Production dataset this dependency is linked to" + foreign_key_column: "dataset_id" + description: "Production dataset this dependency is linked to (for every dependency, this, or `input_id`, must be non-null)" execution_id: type: "Integer" foreign_key: True foreign_key_schema: "self" foreign_key_table: "execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this dependency is linked to" dataset: @@ -260,7 +260,7 @@ dataset: foreign_key: True foreign_key_schema: "self" foreign_key_table: 
"execution" - foreign_key_row: "execution_id" + foreign_key_column: "execution_id" description: "Execution this dataset is linked to" cli_optional: True is_overwritten: @@ -278,5 +278,5 @@ dataset: description: "True if an external link" is_archived: type: "Boolean" - nullablee: False + nullable: False description: "True if dataset is archived" From 920bc090bf4f1f768c65f6cdef145e8b676cceef Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 10:15:51 +0100 Subject: [PATCH 8/9] Fix CLI for all schema columns when registering a dataset --- src/cli/cli.py | 19 +++++++++++-------- src/cli/register.py | 3 +++ src/dataregistry/registrar.py | 4 ++-- .../create_test_entries_cli.sh | 6 +++++- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/cli/cli.py b/src/cli/cli.py index b6a0f9de..a60ecc9d 100644 --- a/src/cli/cli.py +++ b/src/cli/cli.py @@ -74,28 +74,31 @@ # Get some information from the `schema.yaml` file for column in schema_data["dataset"]: + extra_args = {} + # Any default? if schema_data["dataset"][column]["cli_default"] is not None: - default = schema_data["dataset"][column]["cli_default"] - default_str = f" (default={default})" + extra_args["default"] = schema_data["dataset"][column]["cli_default"] + default_str = f" (default={extra_args['default']})" else: - default = None default_str = "" # Restricted to choices? if schema_data["dataset"][column]["choices"] is not None: - choices = schema_data["dataset"][column]["choices"] + extra_args["choices"] = schema_data["dataset"][column]["choices"] + + # Is this a boolean flag? + if schema_data["dataset"][column]["type"] == "Boolean": + extra_args["action"] = "store_true" else: - choices = None + extra_args["type"] = _TYPE_TRANSLATE[schema_data["dataset"][column]["type"]] # Add flag if schema_data["dataset"][column]["cli_optional"]: arg_register_dataset.add_argument( "--" + column, help=schema_data["dataset"][column]["description"] + default_str, - default=default, - choices=choices, - type=_TYPE_TRANSLATE[schema_data["dataset"][column]["type"]], + **extra_args, ) # Entries unique to registering the dataset using the CLI diff --git a/src/cli/register.py b/src/cli/register.py index 5518c3e0..ca74870b 100644 --- a/src/cli/register.py +++ b/src/cli/register.py @@ -38,6 +38,9 @@ def register_dataset(args): name=args.name, version_suffix=args.version_suffix, creation_date=args.creation_date, + access_API=args.access_API, + execution_id=args.execution_id, + is_overwritable=args.is_overwritable, description=args.description, old_location=args.old_location, copy=(not args.make_symlink), diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 7d13725a..3866fded 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -505,10 +505,10 @@ def register_dataset( if version_suffix: values["version_suffix"] = version_suffix if creation_date: - values["dataset_creation_date"] = creation_date + values["creation_date"] = creation_date else: if ds_creation_date: - values["dataset_creation_date"] = ds_creation_date + values["creation_date"] = ds_creation_date if description: values["description"] = description if execution_id: diff --git a/tests/end_to_end_tests/create_test_entries_cli.sh b/tests/end_to_end_tests/create_test_entries_cli.sh index b439b4aa..b008e3ac 100644 --- a/tests/end_to_end_tests/create_test_entries_cli.sh +++ b/tests/end_to_end_tests/create_test_entries_cli.sh @@ -16,7 +16,11 @@ dregs register dataset my_cli_dataset3 "1.2.3" --is_dummy \ --owner "DESC" \ 
--owner_type "group" \ --version_suffix "test" \ - --root_dir "DataRegistry_data" + --root_dir "DataRegistry_data" \ + --creation_date "2020-01-01" \ + --input_datasets 1 2 \ + --execution_name "I have given the execution a name" \ + --is_overwritable # A production dataset if [ "$DATAREG_BACKEND" = "postgres" ]; then From 59da1d870fea11ae399ab68ed4d9ce2db016f52c Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Wed, 6 Dec 2023 10:24:26 +0100 Subject: [PATCH 9/9] Reduce register doc strings --- src/dataregistry/registrar.py | 101 +++++++++++++--------------------- 1 file changed, 37 insertions(+), 64 deletions(-) diff --git a/src/dataregistry/registrar.py b/src/dataregistry/registrar.py index 3866fded..4eba83cd 100644 --- a/src/dataregistry/registrar.py +++ b/src/dataregistry/registrar.py @@ -229,22 +229,19 @@ def register_execution( """ Register a new execution in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - name : str - Typically pipeline name or program name - description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - locale : str, optional - Where was the execution performed? - configuration : str, optional - Path to text file used to configure the execution - input_datasets : list, optional - List of dataset ids that were the input to this execution - input_production_datasets : list, optional - List of production dataset ids that were the input to this execution + name** : str + description** : str, optional + execution_start** : datetime, optional + locale** : str, optional + configuration** : str, optional + input_datasets** : list, optional + input_production_datasets** : list, optional max_config_length : int, optional Maxiumum number of lines to read from a configuration file @@ -326,37 +323,21 @@ def register_dataset( """ Register a new dataset in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - relative_path : str - Destination for the dataset within the data registry. Path is - relative to ``//``. - version : str - Semantic version string of the format MAJOR.MINOR.PATCH *or* - a special flag: "patch", "minor" or "major". - - When a special flag is used it automatically bumps the relative - version for you (see examples for more details). - version_suffix : str, optional - Suffix string to place at the end of the version string. Cannot be - used for production datasets. - name : str, optional - Any convenient, evocative name for the human. - - Note the combination of name, version and version_suffix must be - unique. - creation_date : datetime, optional - Manually set creation date of dataset - description : str, optional - Human-readable description of dataset - execution_id : int, optional - Used to associate dataset with a particular execution - access_API : str, optional - Hint as to how to read the data - is_overwritable : bool, optional - True if dataset may be overwritten (defaults to False). - - Note production datasets cannot be overwritten. 
+ relative_path** : str + version** : str + version_suffix** : str, optional + name** : str, optional + creation_date** : datetime, optional + description** : str, optional + execution_id** : int, optional + access_API** : str, optional + is_overwritable** : bool, optional old_location : str, optional Absolute location of dataset to copy into the data registry. @@ -371,23 +352,13 @@ def register_dataset( only) verbose : bool, optional Provide some additional output information - owner : str, optional - Owner of the dataset. If None, defaults to what was set in - Registrar __init__, if that is also None, defaults to $USER. - owner_type : str, optional - Owner type: "user", "group", or "production". If None, defaults to - what was set in Registrar __init__, if that is also None, defaults - to "user". - execution_name : str, optional - Typically pipeline name or program name - execution_description : str, optional - Human readible description of execution - execution_start : datetime, optional - Date the execution started - execution_locale : str, optional - Where was the execution performed? - execution_configuration : str, optional - Path to text file used to configure the execution + owner** : str, optional + owner_type** : str, optional + execution_name** : str, optional + execution_description** : str, optional + execution_start** : datetime, optional + execution_locale** : str, optional + execution_configuration** : str, optional input_datasets : list, optional List of dataset ids that were the input to this execution input_production_datasets : list, optional @@ -549,12 +520,14 @@ def register_dataset_alias(self, aliasname, dataset_id): """ Register a new dataset alias in the DESC data registry. + Any args marked with '**' share their name with the associated column + in the registry schema. Descriptions of what these columns are can be + found in `schema.yaml` or the documentation. + Parameters ---------- - aliasname : str - Human readible alias for the dataset - dataset_id : int - Existing dataset ID to attach dataset alias to + aliasname** : str + dataset_id** : int Returns -------
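
[Editor's note: the following is a minimal, self-contained sketch of the schema-driven CLI pattern these patches converge on (schema.yaml entries mapped to argparse flags, with Boolean columns becoming store_true flags as in PATCH 8/9). The inline YAML snippet, the helper names, and the _TYPE_TRANSLATE values shown here are illustrative assumptions, not the project's actual schema.yaml or mapping.]

# sketch_schema_cli.py -- illustrative only
import argparse
import yaml

# Assumed type mapping; the real _TYPE_TRANSLATE in src/cli/cli.py is not shown in full here.
_TYPE_TRANSLATE = {"String": str, "Integer": int, "Float": float, "Boolean": bool}

# Tiny stand-in for schema.yaml, using the same per-column keys the patches rely on.
schema_yaml = """
dataset:
  description:
    type: "String"
    description: "User provided description of the dataset"
    cli_default: null
    cli_optional: True
    choices: null
  is_overwritable:
    type: "Boolean"
    description: "True if dataset may be overwritten"
    cli_default: null
    cli_optional: True
    choices: null
"""

schema_data = yaml.safe_load(schema_yaml)

parser = argparse.ArgumentParser()
for column, info in schema_data["dataset"].items():
    if not info["cli_optional"]:
        continue
    extra_args = {}
    if info["cli_default"] is not None:
        extra_args["default"] = info["cli_default"]
    if info["choices"] is not None:
        extra_args["choices"] = info["choices"]
    if info["type"] == "Boolean":
        # Boolean columns become on/off flags rather than typed options.
        extra_args["action"] = "store_true"
    else:
        extra_args["type"] = _TYPE_TRANSLATE[info["type"]]
    parser.add_argument("--" + column, help=info["description"], **extra_args)

print(parser.parse_args(["--is_overwritable", "--description", "demo"]))

The point of the design is that adding a column to schema.yaml is enough for it to surface in the table models, the CLI, and the generated docs, without editing each consumer by hand.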