diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4b4756e..1ee11601 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## Version 1.0.4
+
+More options for querying
+
+- There is now a `~=` query operator that utilises the `.ilike` filter to
+  allow case-insensitive filtering with wildcards (`*` is the wildcard
+  character).
+- `dregs ls` can now filter on the dataset name, including `*` wildcards,
+  using the `--name` option.
+- `dregs ls` can return arbitrary columns using the `--return_cols` option.
+
 ## Version 1.0.3
 
 Some changes to the way the `relative_path` is automatically generated from the
diff --git a/docs/source/tutorial_cli.rst b/docs/source/tutorial_cli.rst
index c9f34ed4..af7c5394 100644
--- a/docs/source/tutorial_cli.rst
+++ b/docs/source/tutorial_cli.rst
@@ -112,11 +112,23 @@ For example, to see all the datasets from the DESC Generic Working Group we woul
 
    dregs ls --owner "DESC Generic Working Group"
 
-To list entries from all owners type
+To list entries from all owners, pass ``--owner none``.
+
+You can search against the ``dataset.name`` column, with wildcard support,
+where ``*`` is the wildcard character, e.g.,
+
+.. code-block:: bash
+
+   dregs ls --name "dataset:dc2:*"
+
+will search for all datasets whose name starts with the pattern "dataset:dc2:"
+(note the ``--name`` queries here are case insensitive).
+
+To select which columns are printed in the result use the ``--return_cols``
+option, e.g.,
 
 .. code-block:: bash
 
-   dregs ls --all
+   dregs ls --return_cols dataset_id name description status
 
 Using ``dregs ls`` is a quick and easy way to remind yourself what names you gave
 to previous datasets, and what relative paths they reside at.
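As a companion to the CLI examples above, a minimal Python-side sketch of the equivalent query follows. It assumes a default `DataRegistry()` connection (connection details not shown in this patch); the `gen_filter` and `find_datasets` calls mirror those used in the notebook and tests below.

```python
# Hedged sketch: the Python analogue of
# `dregs ls --name "dataset:dc2:*" --return_cols dataset_id name status`
from dataregistry import DataRegistry

datareg = DataRegistry()  # assumes a default connection/config

# '~=' is the new case-insensitive wildcard operator ('*' is the wildcard)
f = datareg.Query.gen_filter("dataset.name", "~=", "dataset:dc2:*")

# Only return the columns we care about (the --return_cols analogue)
results = datareg.Query.find_datasets(
    property_names=["dataset.dataset_id", "dataset.name", "dataset.status"],
    filters=[f],
)
```

`find_datasets` returns a dict-like mapping of column names to value lists, as exercised in the tests at the end of this patch.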
diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb
index cd832aff..5af3cb8f 100644
--- a/docs/source/tutorial_notebooks/query_datasets.ipynb
+++ b/docs/source/tutorial_notebooks/query_datasets.ipynb
@@ -116,6 +116,33 @@
     "\n",
     "The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n",
     "\n",
+    "A special operator, `~=`, can be used to perform wildcard queries, where `*` is the wildcard character. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "578951ce-cc30-4ab3-88b4-7bb98b734b9c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a filter that queries on the dataset name with a wildcard\n",
+    "f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:*')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "877e32d9-08b6-4121-8afa-07f3ed8f8524",
+   "metadata": {},
+   "source": [
+    "This will return all datasets whose name begins with the pattern `nersc_tutorial:`. The `~=` operator is case insensitive; for case-sensitive wildcard searching, use the `~==` operator."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37c23c39-7a78-4931-a281-226a7bfc9333",
+   "metadata": {},
+   "source": [
     "### Performing the query\n",
     "\n",
     "Now we can pass this filter through to a query using the `Query` extension of the `DataRegistry` class, e.g.,"
diff --git a/src/dataregistry/_version.py b/src/dataregistry/_version.py
index 976498ab..92192eed 100644
--- a/src/dataregistry/_version.py
+++ b/src/dataregistry/_version.py
@@ -1 +1 @@
-__version__ = "1.0.3"
+__version__ = "1.0.4"
diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py
index fa134fe7..70fd965a 100644
--- a/src/dataregistry/query.py
+++ b/src/dataregistry/query.py
@@ -59,6 +59,8 @@
     "<=": "__le__",
     ">": "__gt__",
     ">=": "__ge__",
+    "~=": None,
+    "~==": None,
 }
 
 ALL_ORDERABLE = (
@@ -74,6 +76,12 @@
     .union(LITE_TYPES)
 )
 
+ILIKE_ALLOWED = [
+    "dataset.name",
+    "dataset.owner",
+    "dataset.relative_path",
+    "dataset.access_api",
+]
 
 def is_orderable_type(ctype):
     return type(ctype) in ALL_ORDERABLE
@@ -272,12 +280,28 @@
 
         # Extract the property we are ordering on (also making sure it
         # is orderable)
-        if not column_is_orderable[0] and f[1] not in ["==", "=", "!="]:
+        if not column_is_orderable[0] and f[1] not in ["~==", "~=", "==", "=", "!="]:
             raise ValueError(f'check_filter: Cannot apply "{f[1]}" to "{f[0]}"')
         else:
             value = f[2]
 
-        return stmt.where(column_ref[0].__getattribute__(the_op)(value))
+        # String partial matching with wildcards
+        if f[1] in ["~=", "~=="]:
+            if f[0] not in ILIKE_ALLOWED:
+                raise ValueError(f"Can only perform ~= search on {ILIKE_ALLOWED}")
+
+            # Escape literal '%' and '_', then map the user-facing '*'
+            # wildcard onto SQL's '%'
+            tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%')
+
+            # Case-insensitive wildcard matching (wildcard is '*')
+            if f[1] == "~=":
+                return stmt.where(column_ref[0].ilike(tmp))
+            # Case-sensitive wildcard matching (wildcard is '*')
+            else:
+                return stmt.where(column_ref[0].like(tmp))
+
+        # General case using a traditional boolean operator
+        else:
+            return stmt.where(column_ref[0].__getattribute__(the_op)(value))
 
     def _append_filter_tables(self, tables_required, filters):
         """
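To make the pattern translation above concrete, here is a small self-contained sketch (no database needed) of what `~=`/`~==` do to a user-supplied pattern before it reaches SQLAlchemy's `ilike`/`like`. The helper name is hypothetical; the replace chain is copied from `_render_filter` above.

```python
# Standalone sketch of the pattern translation in _render_filter: literal
# SQL wildcards in the user's string are escaped, then '*' becomes '%'.
def translate_wildcards(value: str) -> str:
    return value.replace('%', r'\%').replace('_', r'\_').replace('*', '%')

assert translate_wildcards("nersc_tutorial:*") == r"nersc\_tutorial:%"
assert translate_wildcards("50%_done*") == r"50\%\_done%"
```

This is why a literal `%` or `_` in a dataset name is matched verbatim rather than being treated as a SQL wildcard.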
diff --git a/src/dataregistry_cli/cli.py b/src/dataregistry_cli/cli.py
index be82b15c..ce6eb1e4 100644
--- a/src/dataregistry_cli/cli.py
+++ b/src/dataregistry_cli/cli.py
@@ -77,9 +77,14 @@ def get_parser():
         help="List datasets for a given owner type",
         choices=["user", "group", "production", "project"],
     )
-    arg_ls.add_argument("--all", help="List all datasets", action="store_true")
     arg_ls.add_argument(
-        "--extended", help="List more properties than the default", action="store_true"
+        "--name", help="Only return datasets with a given name (wildcard support)"
+    )
+    arg_ls.add_argument(
+        "--return_cols",
+        help="List of columns to return in the query",
+        nargs="+",
+        type=str,
     )
     arg_ls.add_argument(
         "--max_rows",
diff --git a/src/dataregistry_cli/query.py b/src/dataregistry_cli/query.py
index bafd755c..b16eb68d 100644
--- a/src/dataregistry_cli/query.py
+++ b/src/dataregistry_cli/query.py
@@ -1,6 +1,50 @@
 import os
 from dataregistry import DataRegistry
 import pandas as pd
+from dataregistry import Filter
+
+
+def _render_filters(datareg, args):
+    """
+    Build the list of filters to apply to the query.
+
+    Both the owner and owner_type columns can be filtered against. In
+    addition, the dataset name column can be filtered against (allowing for
+    `*` wildcards).
+
+    Keywords can also be filtered against. These have to be treated
+    separately, as they need to be linked to the keywords table.
+
+    Parameters
+    ----------
+    datareg : DataRegistry object
+    args : argparse object
+
+    Returns
+    -------
+    filters : list[Filter]
+    """
+
+    filters = []
+
+    # Dataset columns we can filter by
+    queriables = ["owner", "owner_type", "name"]
+
+    print("\nDataRegistry query:", end=" ")
+    for col in queriables:
+        # Add filter on this column
+        if getattr(args, col) is not None:
+            if col == "name":
+                filters.append(Filter(f"dataset.{col}", "~=", getattr(args, col)))
+            else:
+                if not (col == "owner" and getattr(args, col).lower() == "none"):
+                    filters.append(Filter(f"dataset.{col}", "==", getattr(args, col)))
+            print(f"{col}=={getattr(args, col)}", end=" ")
+
+    # Add keywords filter
+    if args.keyword is not None:
+        filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))
+
+    return filters
 
 
 def dregs_ls(args):
@@ -20,6 +64,9 @@
         Owner to list dataset entries for
     args.owner_type : str
         Owner type to list dataset entries for
+    args.name : str
+        Filter to only those results with a given dataset name (`*` can be
+        used as a wildcard)
     args.all : bool
         True to show all datasets, no filters
     args.config_file : str
@@ -30,8 +77,8 @@
         Path to root_dir
     args.site : str
         Look up root_dir using a site
-    args.extended : bool
-        True to list more dataset properties
+    args.return_cols : list[str]
+        List of dataset columns to return
     args.max_chars : int
         Maximum number of characters to print per column
     args.max_rows : int
@@ -59,28 +106,12 @@
     else:
         datareg_prod = None
 
-    # Filter on dataset owner and/or owner_type
-    filters = []
+    # By default, search for "our" datasets
+    if args.owner is None:
+        args.owner = os.getenv("USER")
 
-    print("\nDataRegistry query:", end=" ")
-    if not args.all:
-        # Add owner_type filter
-        if args.owner_type is not None:
-            filters.append(Filter("dataset.owner_type", "==", args.owner_type))
-            print(f"owner_type=={args.owner_type}", end=" ")
-
-        # Add owner filter
-        if args.owner is None:
-            if args.owner_type is None:
-                filters.append(
-                    datareg.Query.gen_filter("dataset.owner", "==", os.getenv("USER"))
-                )
-                print(f"owner=={os.getenv('USER')}", end=" ")
-            else:
-                filters.append(datareg.Query.gen_filter("dataset.owner", "==", args.owner))
-                print(f"owner=={args.owner}", end=" ")
-    else:
-        print("all datasets", end=" ")
+    # Render search filters
+    filters = _render_filters(datareg, args)
 
     # What columns are we printing
     _print_cols = [
@@ -90,23 +121,11 @@
         "dataset.owner_type",
         "dataset.description",
     ]
-    if args.extended:
-        _print_cols.extend(
-            [
-                "dataset.dataset_id",
-                "dataset.relative_path",
-                "dataset.status",
-                "dataset.register_date",
-                "dataset.is_overwritable",
-            ]
-        )
-
-    # Add keywords filter
+    if args.return_cols is not None:
+        _print_cols = [f"dataset.{x}" for x in args.return_cols]
 
     if args.keyword is not None:
         _print_cols.append("keyword.keyword")
-        filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))
-
     # Loop over this schema and the production schema and print the results
     for this_datareg in [datareg, datareg_prod]:
         if this_datareg is None:
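As a quick illustration of how the new CLI flags flow into `_render_filters`, here is a hedged sketch using an `argparse.Namespace` stand-in for the parsed arguments (values are illustrative; `keyword` is left as `None` so no live registry connection is needed):

```python
# Sketch: how `dregs ls --owner none --name "nersc_tutorial:*"` becomes filters
from argparse import Namespace
from dataregistry_cli.query import _render_filters

args = Namespace(owner="none", owner_type=None,
                 name="nersc_tutorial:*", keyword=None)

# owner == "none" is deliberately skipped (list all owners); the name
# filter uses the case-insensitive '~=' operator
filters = _render_filters(datareg=None, args=args)
# -> [Filter('dataset.name', '~=', 'nersc_tutorial:*')]
```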
"dataset.version_string") == _V_STRING + +@pytest.mark.skipif( + datareg.db_connection._dialect == "sqlite", reason="wildcards break for sqlite" +) +@pytest.mark.parametrize( + "op,qstr,ans,tag", + [ + ("~=", "DESC:datasets:test_query_name_nocasewildcard*", 3, "nocasewildcard"), + ("==", "DESC:datasets:test_query_name_exactmatch_first", 1, "exactmatch"), + ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard*", 0, "casewildcardfail"), + ("~==", "DESC:datasets:test_query_name_nocasewildcard*", 3, "casewildcardpass"), + ], +) +def test_query_name(dummy_file, op, qstr, ans, tag): + """Test a quering on a partial name with wildcards""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) + + # Add entry + for tmp_tag in ["first", "second", "third"]: + d_id = _insert_dataset_entry( + datareg, + f"DESC:datasets:test_query_name_{tag}_{tmp_tag}", + "0.0.1", + ) + + # Do a wildcard search on the name + f = datareg.Query.gen_filter("dataset.name", op, qstr) + results = datareg.Query.find_datasets(property_names=None, filters=[f]) + + # How many datasets did we find + if ans == 0: + assert len(results) == 0 + else: + assert len(results) > 0 + for c, v in results.items(): + assert len(v) == ans
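For reference, a minimal self-contained SQLAlchemy sketch of the `ilike`/`like` distinction these tests exercise. The table and engine are stand-ins, not the data registry schema, and the in-memory SQLite engine is purely for demonstration (SQLite's own `LIKE` is case-insensitive for ASCII by default, which is presumably why the wildcard tests above skip that dialect):

```python
# Illustrative only: the SQL-level behaviour behind '~=' (ilike) and
# '~==' (like). Table and engine are stand-ins for the registry schema.
from sqlalchemy import Column, Integer, String, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Dataset(Base):
    __tablename__ = "dataset"
    dataset_id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Dataset(name="DESC:datasets:test_query_name_first"))
    session.commit()

    # '~=' maps onto .ilike(): case-insensitive pattern matching
    stmt = select(Dataset.name).where(Dataset.name.ilike("desc:datasets:%"))
    print(session.scalars(stmt).all())  # matches regardless of case
```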