Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update queries #163

Merged
merged 12 commits into from
Nov 26, 2024
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## Version 1.0.4

Make more options for querying

- There is now a `~=` query operator that can utilise the `.ilike` filter to
  allow non-case-sensitive filtering with wildcards (i.e., the `*` character).
- `dregs ls` can now filter on the dataset name, including `*` wildcards, using
  the `--name` option.
- `dregs ls` can return arbitrary columns using the `--return_cols` option.

## Version 1.0.3

Some changes to the way the `relative_path` is automatically generated from the
Expand Down
16 changes: 14 additions & 2 deletions docs/source/tutorial_cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,23 @@ For example, to see all the datasets from the DESC Generic Working Group we woul

dregs ls --owner "DESC Generic Working Group"

To list entries from all owners type
To list entries from all owners do ``--owner none``.

You can search against the ``dataset.name`` column, with wildcard support,
where ``*`` is the wildcard character, e.g.,

.. code-block:: bash

dregs ls --name dataset:dc2:*

will search for all datasets whose name starts with the pattern "dataset:dc2:"
(note the ``--name`` queries here are case insensitive).

To select what columns are printed in the result use the ``--return_cols`` option, e.g.,

.. code-block:: bash

dregs ls --all
dregs ls --return_cols dataset_id name description status

Using ``dregs ls`` is a quick and easy way to remind yourself what names you
gave to previous datasets, and what relative paths they reside at.
27 changes: 27 additions & 0 deletions docs/source/tutorial_notebooks/query_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,33 @@
"\n",
"The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n",
"\n",
"A special operator, `~=`, can be used to perform wildcard queries, where `*` is the wildcard character. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578951ce-cc30-4ab3-88b4-7bb98b734b9c",
"metadata": {},
"outputs": [],
"source": [
"# Create a filter that queries on the dataset name with a wildcard\n",
"f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:*')"
]
},
{
"cell_type": "markdown",
"id": "877e32d9-08b6-4121-8afa-07f3ed8f8524",
"metadata": {},
"source": [
"will return all datasets whose name begins with the pattern `nersc_tutorial:`. The `~=` operator is case insensitive, for case sensitive wildcard searching, one can use the `~==` operator."
]
},
{
"cell_type": "markdown",
"id": "37c23c39-7a78-4931-a281-226a7bfc9333",
"metadata": {},
"source": [
"### Performing the query\n",
"\n",
"Now we can pass this filter through to a query using the `Query` extension of the `DataRegistry` class, e.g.,"
Expand Down
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.3"
__version__ = "1.0.4"
28 changes: 26 additions & 2 deletions src/dataregistry/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
"<=": "__le__",
">": "__gt__",
">=": "__ge__",
"~=": None,
"~==": None,
}

ALL_ORDERABLE = (
Expand All @@ -74,6 +76,12 @@
.union(LITE_TYPES)
)

# Column identifiers on which the wildcard operators (`~=` / `~==`) may be
# used.  These operators are implemented with SQLAlchemy's `ilike`/`like`
# (see `_render_filter`), which are string-matching operators, so the list
# is restricted to string-valued dataset columns.
ILIKE_ALLOWED = [
    "dataset.name",
    "dataset.owner",
    "dataset.relative_path",
    "dataset.access_api"
]

def is_orderable_type(ctype):
    """Return True when the exact type of *ctype* is one of ``ALL_ORDERABLE``."""
    # Deliberately an exact-type membership test (not isinstance), so
    # subclasses of orderable types are not themselves treated as orderable.
    return any(type(ctype) is orderable for orderable in ALL_ORDERABLE)
Expand Down Expand Up @@ -272,12 +280,28 @@ def _render_filter(self, f, stmt):

# Extract the property we are ordering on (also making sure it
# is orderable)
if not column_is_orderable[0] and f[1] not in ["==", "=", "!="]:
if not column_is_orderable[0] and f[1] not in ["~==", "~=", "==", "=", "!="]:
raise ValueError('check_filter: Cannot apply "{f[1]}" to "{f[0]}"')
else:
value = f[2]

return stmt.where(column_ref[0].__getattribute__(the_op)(value))
# String partial matching with wildcard
if f[1] in ["~=", "~=="]:
if f[0] not in ILIKE_ALLOWED:
raise ValueError(f"Can only perform ~= search on {ILIKE_ALLOWED}")

tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%')

# Case insensitive wildcard matching (wildcard is '*')
if f[1] == "~=":
return stmt.where(column_ref[0].ilike(tmp))
# Case sensitive wildcard matching (wildcard is '*')
else:
return stmt.where(column_ref[0].like(tmp))

# General case using traditional boolean operator
else:
return stmt.where(column_ref[0].__getattribute__(the_op)(value))

def _append_filter_tables(self, tables_required, filters):
"""
Expand Down
9 changes: 7 additions & 2 deletions src/dataregistry_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,14 @@ def get_parser():
help="List datasets for a given owner type",
choices=["user", "group", "production", "project"],
)
arg_ls.add_argument("--all", help="List all datasets", action="store_true")
arg_ls.add_argument(
"--extended", help="List more properties than the default", action="store_true"
"--name", help="Only return datasets with a given name (wildcard support)"
)
arg_ls.add_argument(
"--return_cols",
help="List of columns to return in the query",
nargs="+",
type=str,
)
arg_ls.add_argument(
"--max_rows",
Expand Down
93 changes: 56 additions & 37 deletions src/dataregistry_cli/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,50 @@
import os
from dataregistry import DataRegistry
import pandas as pd
from dataregistry import Filter


def _render_filters(datareg, args):
    """
    Build the list of query filters from the CLI arguments.

    Both the `owner` and `owner_type` columns can be filtered against. In
    addition, the dataset `name` column can be filtered against using the
    case-insensitive `~=` operator, where `*` is the wildcard character.

    Passing ``--owner none`` suppresses the owner filter entirely (i.e.,
    list datasets from all owners).

    Keywords can also be filtered against. These have to be treated separately,
    as they need to be linked to the keywords table.

    Parameters
    ----------
    datareg : DataRegistry object
    args : argparse object

    Returns
    -------
    filters : list[Filter]
    """

    filters = []

    # Dataset columns we can filter by
    queriables = ["owner", "owner_type", "name"]

    print("\nDataRegistry query:", end=" ")
    for col in queriables:
        value = getattr(args, col)
        if value is None:
            continue

        if col == "name":
            # Wildcard name matching (`~=` is case insensitive, `*` wildcard)
            filters.append(Filter(f"dataset.{col}", "~=", value))
            # Echo the operator actually used, not `==`
            print(f"{col}~={value}", end=" ")
        else:
            # `--owner none` is a sentinel meaning "do not filter on owner"
            if not (col == "owner" and value.lower() == "none"):
                filters.append(Filter(f"dataset.{col}", "==", value))
            print(f"{col}=={value}", end=" ")

    # Keyword filters are built via `gen_filter` as they must be linked to
    # the keywords table
    if args.keyword is not None:
        filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))

    return filters


def dregs_ls(args):
Expand All @@ -20,6 +64,9 @@ def dregs_ls(args):
Owner to list dataset entries for
args.owner_type : str
Owner type to list dataset entries for
args.name : str
Filter to only those results with a given dataset name (% can be used
as a wildcard)
args.all : bool
True to show all datasets, no filters
args.config_file : str
Expand All @@ -30,8 +77,8 @@ def dregs_ls(args):
Path to root_dir
args.site : str
Look up root_dir using a site
args.extended : bool
True to list more dataset properties
args.return_cols : list[str]
List of dataset columns to return
args.max_chars : int
Maximum number of character to print per column
args.max_rows : int
Expand Down Expand Up @@ -59,28 +106,12 @@ def dregs_ls(args):
else:
datareg_prod = None

# Filter on dataset owner and/or owner_type
filters = []
# By default, search for "our" dataset
if args.owner is None:
args.owner = os.getenv("USER")

print("\nDataRegistry query:", end=" ")
if not args.all:
# Add owner_type filter
if args.owner_type is not None:
filters.append(Filter("dataset.owner_type", "==", args.owner_type))
print(f"owner_type=={args.owner_type}", end=" ")

# Add owner filter
if args.owner is None:
if args.owner_type is None:
filters.append(
datareg.Query.gen_filter("dataset.owner", "==", os.getenv("USER"))
)
print(f"owner=={os.getenv('USER')}", end=" ")
else:
filters.append(datareg.Query.gen_filter("dataset.owner", "==", args.owner))
print(f"owner=={args.owner}", end=" ")
else:
print("all datasets", end=" ")
# Render search filters
filters = _render_filters(datareg, args)

# What columns are we printing
_print_cols = [
Expand All @@ -90,23 +121,11 @@ def dregs_ls(args):
"dataset.owner_type",
"dataset.description",
]
if args.extended:
_print_cols.extend(
[
"dataset.dataset_id",
"dataset.relative_path",
"dataset.status",
"dataset.register_date",
"dataset.is_overwritable",
]
)

# Add keywords filter
if args.return_cols is not None:
_print_cols = [f"dataset.{x}" for x in args.return_cols]
if args.keyword is not None:
_print_cols.append("keyword.keyword")

filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))

# Loop over this schema and the production schema and print the results
for this_datareg in [datareg, datareg_prod]:
if this_datareg is None:
Expand Down
40 changes: 40 additions & 0 deletions tests/end_to_end_tests/test_query.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import os
import pandas as pd
import sqlalchemy
Expand Down Expand Up @@ -107,3 +108,42 @@ def test_query_between_columns(dummy_file):
assert i < 1
assert getattr(r, "dataset.name") == _NAME
assert getattr(r, "dataset.version_string") == _V_STRING

@pytest.mark.skipif(
    datareg.db_connection._dialect == "sqlite", reason="wildcards break for sqlite"
)
@pytest.mark.parametrize(
    "op,qstr,ans,tag",
    [
        ("~=", "DESC:datasets:test_query_name_nocasewildcard*", 3, "nocasewildcard"),
        ("==", "DESC:datasets:test_query_name_exactmatch_first", 1, "exactmatch"),
        ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard*", 0, "casewildcardfail"),
        ("~==", "DESC:datasets:test_query_name_nocasewildcard*", 3, "casewildcardpass"),
    ],
)
def test_query_name(dummy_file, op, qstr, ans, tag):
    """Query on a partial dataset name, exercising the wildcard operators."""

    # Connect to the working schema of the registry
    tmp_src_dir, tmp_root_dir = dummy_file
    datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)

    # Register three datasets sharing the naming pattern for this `tag`
    for suffix in ("first", "second", "third"):
        _insert_dataset_entry(
            datareg,
            f"DESC:datasets:test_query_name_{tag}_{suffix}",
            "0.0.1",
        )

    # Run the name query with the operator under test
    name_filter = datareg.Query.gen_filter("dataset.name", op, qstr)
    results = datareg.Query.find_datasets(property_names=None, filters=[name_filter])

    # Check the number of matches is as expected
    if ans == 0:
        assert len(results) == 0
    else:
        assert len(results) > 0
        for column, values in results.items():
            assert len(values) == ans
Loading