From ae721030d7ccc09361270fdd6285737206c98964 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 19:41:28 +0100 Subject: [PATCH 01/12] Add ~= operator to search using wildcards --- src/dataregistry/query.py | 10 +++++++-- tests/end_to_end_tests/test_query.py | 31 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index fa134fe7..a2c56b25 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -59,6 +59,7 @@ "<=": "__le__", ">": "__gt__", ">=": "__ge__", + "~=": None, } ALL_ORDERABLE = ( @@ -272,12 +273,17 @@ def _render_filter(self, f, stmt): # Extract the property we are ordering on (also making sure it # is orderable) - if not column_is_orderable[0] and f[1] not in ["==", "=", "!="]: + if not column_is_orderable[0] and f[1] not in ["~=", "==", "=", "!="]: raise ValueError('check_filter: Cannot apply "{f[1]}" to "{f[0]}"') else: value = f[2] - return stmt.where(column_ref[0].__getattribute__(the_op)(value)) + # Special case where we are partially matching with a wildcard + if f[1] == "~=": + return stmt.where(column_ref[0].ilike(value)) + # General case + else: + return stmt.where(column_ref[0].__getattribute__(the_op)(value)) def _append_filter_tables(self, tables_required, filters): """ diff --git a/tests/end_to_end_tests/test_query.py b/tests/end_to_end_tests/test_query.py index 92268e6e..3f9fb6f3 100644 --- a/tests/end_to_end_tests/test_query.py +++ b/tests/end_to_end_tests/test_query.py @@ -107,3 +107,34 @@ def test_query_between_columns(dummy_file): assert i < 1 assert getattr(r, "dataset.name") == _NAME assert getattr(r, "dataset.version_string") == _V_STRING + +def test_query_name(dummy_file): + """Test a quering on a partial name'""" + + # Establish connection to database + tmp_src_dir, tmp_root_dir = dummy_file + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) + + # Add entry + for tag in ["first", "second", "third"]: + d_id = _insert_dataset_entry( + datareg, + f"DESC:datasets:test_query_name_{tag}", + "0.0.1", + ) + + # Do a wildcard search on the name + f = datareg.Query.gen_filter("dataset.name", "~=", "DESC:datasets:test_query_name%") + results = datareg.Query.find_datasets(property_names=None, filters=[f]) + + # Should return all three of our datasets + for c, v in results.items(): + assert len(v) == 3 + + # Do an exact search on the name + f = datareg.Query.gen_filter("dataset.name", "==", "DESC:datasets:test_query_name_first") + results = datareg.Query.find_datasets(property_names=None, filters=[f]) + + # Should return just one dataset + for c, v in results.items(): + assert len(v) == 1 From a0c047b527b9047c83117123648b61a7e36b5a4c Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:23:19 +0100 Subject: [PATCH 02/12] Update dregs ls to allow for wildcard querying on the dataset name --- CHANGELOG.md | 10 ++++ src/dataregistry_cli/cli.py | 4 +- src/dataregistry_cli/query.py | 95 +++++++++++++++++++++-------------- 3 files changed, 70 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4b4756e..1ee11601 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## Version 1.0.4 + +Make more options for querying + +- There is now a `~=` query operator that can utalise the `.ilike` filter to + allow non-case-sensitive filering with wildcards (i.e., the `%` character). +- `dregs ls` can now filter on the dataset name, including `%` wildcards, using + the `--name` option. +- `dregs_ls` can return arbitrary columns using the `--return_cols` option + ## Version 1.0.3 Some changes to the way the `relative_path` is automatically generated from the diff --git a/src/dataregistry_cli/cli.py b/src/dataregistry_cli/cli.py index be82b15c..034cfda2 100644 --- a/src/dataregistry_cli/cli.py +++ b/src/dataregistry_cli/cli.py @@ -77,10 +77,10 @@ def get_parser(): help="List datasets for a given owner type", choices=["user", "group", "production", "project"], ) + arg_ls.add_argument("--name", help="Only return datasets with a given name (% can be used as a wildcard)") arg_ls.add_argument("--all", help="List all datasets", action="store_true") arg_ls.add_argument( - "--extended", help="List more properties than the default", action="store_true" - ) + "--return_cols", help="List of columns to return in the query", nargs="+", type=str) arg_ls.add_argument( "--max_rows", help="Maximum number of rows to print (default 500)", diff --git a/src/dataregistry_cli/query.py b/src/dataregistry_cli/query.py index bafd755c..2018adee 100644 --- a/src/dataregistry_cli/query.py +++ b/src/dataregistry_cli/query.py @@ -1,6 +1,52 @@ import os from dataregistry import DataRegistry import pandas as pd +from dataregistry import Filter + + +def _render_filters(datareg, args): + """ + Apply a filter to the query. + + Both the owner and owner_type columns can be filtered against. In addition, + the dataset name column can be filtered against (allowed for % wildcards). + + Keywords can also be filtered against. These have to be treated separately, + as they need to be linked to the keywords table. + + Parameters + ---------- + datareg : DataRegistry object + args : argparse object + + Returns + ------- + filters : list[Filter] + """ + + filters = [] + + # Dataset columns we can filter by + queriables = ["owner", "owner_type", "name"] + + print("\nDataRegistry query:", end=" ") + if not args.all: + for col in queriables: + # Add filter on this column + if getattr(args, col) is not None: + if col == "name": + filters.append(Filter(f"dataset.{col}", "~=", getattr(args, col))) + else: + filters.append(Filter(f"dataset.{col}", "==", getattr(args, col))) + print(f"{col}=={getattr(args, col)}", end=" ") + else: + print("all datasets", end=" ") + + # Add keywords filter + if args.keyword is not None: + filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword)) + + return filters def dregs_ls(args): @@ -20,6 +66,9 @@ def dregs_ls(args): Owner to list dataset entries for args.owner_type : str Owner type to list dataset entries for + args.name : str + Filter to only those results with a given dataset name (% can be used + as a wildcard) args.all : bool True to show all datasets, no filters args.config_file : str @@ -30,8 +79,8 @@ def dregs_ls(args): Path to root_dir args.site : str Look up root_dir using a site - args.extended : bool - True to list more dataset properties + args.return_cols : list[str] + List of dataset columns to return args.max_chars : int Maximum number of character to print per column args.max_rows : int @@ -59,28 +108,12 @@ def dregs_ls(args): else: datareg_prod = None - # Filter on dataset owner and/or owner_type - filters = [] + # By default, search for "our" dataset + if args.owner is None: + args.owner = os.getenv("USER") - print("\nDataRegistry query:", end=" ") - if not args.all: - # Add owner_type filter - if args.owner_type is not None: - filters.append(Filter("dataset.owner_type", "==", args.owner_type)) - print(f"owner_type=={args.owner_type}", end=" ") - - # Add owner filter - if args.owner is None: - if args.owner_type is None: - filters.append( - datareg.Query.gen_filter("dataset.owner", "==", os.getenv("USER")) - ) - print(f"owner=={os.getenv('USER')}", end=" ") - else: - filters.append(datareg.Query.gen_filter("dataset.owner", "==", args.owner)) - print(f"owner=={args.owner}", end=" ") - else: - print("all datasets", end=" ") + # Render search filters + filters = _render_filters(datareg, args) # What columns are we printing _print_cols = [ @@ -90,23 +123,11 @@ def dregs_ls(args): "dataset.owner_type", "dataset.description", ] - if args.extended: - _print_cols.extend( - [ - "dataset.dataset_id", - "dataset.relative_path", - "dataset.status", - "dataset.register_date", - "dataset.is_overwritable", - ] - ) - - # Add keywords filter + if args.return_cols is not None: + _print_cols = [f"dataset.{x}" for x in args.return_cols] if args.keyword is not None: _print_cols.append("keyword.keyword") - filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword)) - # Loop over this schema and the production schema and print the results for this_datareg in [datareg, datareg_prod]: if this_datareg is None: From bed26bcdfffdc4bd7405fd2c273cb3a10960738d Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:23:41 +0100 Subject: [PATCH 03/12] Update version to 1.0.4 --- src/dataregistry/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataregistry/_version.py b/src/dataregistry/_version.py index 976498ab..92192eed 100644 --- a/src/dataregistry/_version.py +++ b/src/dataregistry/_version.py @@ -1 +1 @@ -__version__ = "1.0.3" +__version__ = "1.0.4" From 3e03e68848a643d40dcb5778dfd5f18f5d238714 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:43:19 +0100 Subject: [PATCH 04/12] Remove % from help string --- src/dataregistry_cli/cli.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/dataregistry_cli/cli.py b/src/dataregistry_cli/cli.py index 034cfda2..17754340 100644 --- a/src/dataregistry_cli/cli.py +++ b/src/dataregistry_cli/cli.py @@ -77,10 +77,16 @@ def get_parser(): help="List datasets for a given owner type", choices=["user", "group", "production", "project"], ) - arg_ls.add_argument("--name", help="Only return datasets with a given name (% can be used as a wildcard)") + arg_ls.add_argument( + "--name", help="Only return datasets with a given name (wildcard support)" + ) arg_ls.add_argument("--all", help="List all datasets", action="store_true") arg_ls.add_argument( - "--return_cols", help="List of columns to return in the query", nargs="+", type=str) + "--return_cols", + help="List of columns to return in the query", + nargs="+", + type=str, + ) arg_ls.add_argument( "--max_rows", help="Maximum number of rows to print (default 500)", From 66381207d3f0e5213084ca23d020d98bb55c72ab Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:54:22 +0100 Subject: [PATCH 05/12] Update docs --- docs/source/tutorial_cli.rst | 14 ++++++++++++-- .../tutorial_notebooks/query_datasets.ipynb | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorial_cli.rst b/docs/source/tutorial_cli.rst index c9f34ed4..bc67ca21 100644 --- a/docs/source/tutorial_cli.rst +++ b/docs/source/tutorial_cli.rst @@ -112,11 +112,21 @@ For example, to see all the datasets from the DESC Generic Working Group we woul dregs ls --owner "DESC Generic Working Group" -To list entries from all owners type +To list entries from all owners do ``--owner none``. + +You can search against the ``dataset.name`` column, with wildcard support, e.g., + +.. code-block:: bash + + dregs ls --name dataset:dc2:% + +will search for all datasets whose name starts with the pattern "dataset:dc2:". + +To select what columns are printed in the result use the ``--return_cols`` option, e.g., .. code-block:: bash - dregs ls --all + dregs ls --return_cols dataset_id name description status Using ``dregs ls`` is a quick an easy way to remind yourself what names you gave to previous datasets, and what relative paths they reside at. diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb index cd832aff..9ae576b5 100644 --- a/docs/source/tutorial_notebooks/query_datasets.ipynb +++ b/docs/source/tutorial_notebooks/query_datasets.ipynb @@ -116,6 +116,25 @@ "\n", "The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n", "\n", + "A special operator, `~=`, can be use to perform wildcard querties. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, form example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "578951ce-cc30-4ab3-88b4-7bb98b734b9c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a filter that queries on the dataset name with a wildcard\n", + "f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:%')" + ] + }, + { + "cell_type": "markdown", + "id": "37c23c39-7a78-4931-a281-226a7bfc9333", + "metadata": {}, + "source": [ "### Performing the query\n", "\n", "Now we can pass this filter through to a query using the `Query` extension of the `DataRegistry` class, e.g.," From b81e9152009de99f21d2694bbf2a0f7abb061b43 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:56:21 +0100 Subject: [PATCH 06/12] Add support for searching all owners --- src/dataregistry_cli/query.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/dataregistry_cli/query.py b/src/dataregistry_cli/query.py index 2018adee..b16eb68d 100644 --- a/src/dataregistry_cli/query.py +++ b/src/dataregistry_cli/query.py @@ -30,17 +30,15 @@ def _render_filters(datareg, args): queriables = ["owner", "owner_type", "name"] print("\nDataRegistry query:", end=" ") - if not args.all: - for col in queriables: - # Add filter on this column - if getattr(args, col) is not None: - if col == "name": - filters.append(Filter(f"dataset.{col}", "~=", getattr(args, col))) - else: + for col in queriables: + # Add filter on this column + if getattr(args, col) is not None: + if col == "name": + filters.append(Filter(f"dataset.{col}", "~=", getattr(args, col))) + else: + if not (col == "owner" and getattr(args, col).lower() == "none"): filters.append(Filter(f"dataset.{col}", "==", getattr(args, col))) - print(f"{col}=={getattr(args, col)}", end=" ") - else: - print("all datasets", end=" ") + print(f"{col}=={getattr(args, col)}", end=" ") # Add keywords filter if args.keyword is not None: From 2421509541b0557eddf316506c46d39f97387e70 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 29 Oct 2024 20:58:00 +0100 Subject: [PATCH 07/12] Get rid of all from cli --- src/dataregistry_cli/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dataregistry_cli/cli.py b/src/dataregistry_cli/cli.py index 17754340..ce6eb1e4 100644 --- a/src/dataregistry_cli/cli.py +++ b/src/dataregistry_cli/cli.py @@ -80,7 +80,6 @@ def get_parser(): arg_ls.add_argument( "--name", help="Only return datasets with a given name (wildcard support)" ) - arg_ls.add_argument("--all", help="List all datasets", action="store_true") arg_ls.add_argument( "--return_cols", help="List of columns to return in the query", From 7627aa58c9882f1da58b6238216ec682d6f7ccab Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 1 Nov 2024 11:11:53 +0100 Subject: [PATCH 08/12] Add case-sensitive wildcard operator ~== --- docs/source/tutorial_cli.rst | 3 +- .../tutorial_notebooks/query_datasets.ipynb | 10 ++++- src/dataregistry/query.py | 10 +++-- tests/end_to_end_tests/test_query.py | 38 +++++++++++-------- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/docs/source/tutorial_cli.rst b/docs/source/tutorial_cli.rst index bc67ca21..3a3bf736 100644 --- a/docs/source/tutorial_cli.rst +++ b/docs/source/tutorial_cli.rst @@ -120,7 +120,8 @@ You can search against the ``dataset.name`` column, with wildcard support, e.g., dregs ls --name dataset:dc2:% -will search for all datasets whose name starts with the pattern "dataset:dc2:". +will search for all datasets whose name starts with the pattern "dataset:dc2:" +(note the ``--name`` queries here are case insensitive). To select what columns are printed in the result use the ``--return_cols`` option, e.g., diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb index 9ae576b5..33ce6aec 100644 --- a/docs/source/tutorial_notebooks/query_datasets.ipynb +++ b/docs/source/tutorial_notebooks/query_datasets.ipynb @@ -116,7 +116,7 @@ "\n", "The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n", "\n", - "A special operator, `~=`, can be use to perform wildcard querties. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, form example" + "A special operator, `~=`, can be use to perform wildcard querties. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example" ] }, { @@ -130,6 +130,14 @@ "f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:%')" ] }, + { + "cell_type": "markdown", + "id": "877e32d9-08b6-4121-8afa-07f3ed8f8524", + "metadata": {}, + "source": [ + "will return all datasets whose name begins with the pattern `nersc_tutorial:`. The `~=` operator is case insensitive, for case sensitive wildcard searching, one can use the `~==` operator." + ] + }, { "cell_type": "markdown", "id": "37c23c39-7a78-4931-a281-226a7bfc9333", diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index a2c56b25..34b1cba9 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -60,6 +60,7 @@ ">": "__gt__", ">=": "__ge__", "~=": None, + "~==": None, } ALL_ORDERABLE = ( @@ -273,15 +274,18 @@ def _render_filter(self, f, stmt): # Extract the property we are ordering on (also making sure it # is orderable) - if not column_is_orderable[0] and f[1] not in ["~=", "==", "=", "!="]: + if not column_is_orderable[0] and f[1] not in ["~==", "~=", "==", "=", "!="]: raise ValueError('check_filter: Cannot apply "{f[1]}" to "{f[0]}"') else: value = f[2] - # Special case where we are partially matching with a wildcard + # Case insensitive wildcard matching if f[1] == "~=": return stmt.where(column_ref[0].ilike(value)) - # General case + # Case sensitive wildcard matching + elif f[1] == "~==": + return stmt.where(column_ref[0].like(value)) + # General case using traditional boolean operator else: return stmt.where(column_ref[0].__getattribute__(the_op)(value)) diff --git a/tests/end_to_end_tests/test_query.py b/tests/end_to_end_tests/test_query.py index 3f9fb6f3..5dca69d8 100644 --- a/tests/end_to_end_tests/test_query.py +++ b/tests/end_to_end_tests/test_query.py @@ -1,3 +1,4 @@ +import pytest import os import pandas as pd import sqlalchemy @@ -108,33 +109,38 @@ def test_query_between_columns(dummy_file): assert getattr(r, "dataset.name") == _NAME assert getattr(r, "dataset.version_string") == _V_STRING -def test_query_name(dummy_file): - """Test a quering on a partial name'""" +@pytest.mark.parametrize( + "op,qstr,ans,tag", + [ + ("~=", "DESC:datasets:test_query_name_nocasewildcard%", 3, "nocasewildcard"), + ("==", "DESC:datasets:test_query_name_exactmatch_first", 1, "exactmatch"), + ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard%", 0, "casewildcardfail"), + ("~==", "DESC:datasets:test_query_name_nocasewildcard%", 3, "casewildcardpass"), + ], +) +def test_query_name(dummy_file,op,qstr,ans,tag): + """Test a quering on a partial name with wildcards""" # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry - for tag in ["first", "second", "third"]: + for tmp_tag in ["first", "second", "third"]: d_id = _insert_dataset_entry( datareg, - f"DESC:datasets:test_query_name_{tag}", + f"DESC:datasets:test_query_name_{tag}_{tmp_tag}", "0.0.1", ) # Do a wildcard search on the name - f = datareg.Query.gen_filter("dataset.name", "~=", "DESC:datasets:test_query_name%") + f = datareg.Query.gen_filter("dataset.name", op, qstr) results = datareg.Query.find_datasets(property_names=None, filters=[f]) - # Should return all three of our datasets - for c, v in results.items(): - assert len(v) == 3 - - # Do an exact search on the name - f = datareg.Query.gen_filter("dataset.name", "==", "DESC:datasets:test_query_name_first") - results = datareg.Query.find_datasets(property_names=None, filters=[f]) - - # Should return just one dataset - for c, v in results.items(): - assert len(v) == 1 + # How many datasets did we find + if ans == 0: + assert len(results) == 0 + else: + assert len(results) > 0 + for c, v in results.items(): + assert len(v) == ans From 5012a1f31e3a5e8e9a7e7f9e57a192e75d68285f Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 1 Nov 2024 11:33:12 +0100 Subject: [PATCH 09/12] Change query wildcard to * over % --- docs/source/tutorial_cli.rst | 5 +++-- docs/source/tutorial_notebooks/query_datasets.ipynb | 4 ++-- src/dataregistry/query.py | 8 +++++--- tests/end_to_end_tests/test_query.py | 9 +++++---- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/source/tutorial_cli.rst b/docs/source/tutorial_cli.rst index 3a3bf736..af7c5394 100644 --- a/docs/source/tutorial_cli.rst +++ b/docs/source/tutorial_cli.rst @@ -114,11 +114,12 @@ For example, to see all the datasets from the DESC Generic Working Group we woul To list entries from all owners do ``--owner none``. -You can search against the ``dataset.name`` column, with wildcard support, e.g., +You can search against the ``dataset.name`` column, with wildcard support, +where ``*`` is the wildcard character, e.g., .. code-block:: bash - dregs ls --name dataset:dc2:% + dregs ls --name dataset:dc2:* will search for all datasets whose name starts with the pattern "dataset:dc2:" (note the ``--name`` queries here are case insensitive). diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb index 33ce6aec..5af3cb8f 100644 --- a/docs/source/tutorial_notebooks/query_datasets.ipynb +++ b/docs/source/tutorial_notebooks/query_datasets.ipynb @@ -116,7 +116,7 @@ "\n", "The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n", "\n", - "A special operator, `~=`, can be use to perform wildcard querties. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example" + "A special operator, `~=`, can be use to perform wildcard querties, where `*` is the wildcard character. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example" ] }, { @@ -127,7 +127,7 @@ "outputs": [], "source": [ "# Create a filter that queries on the dataset name with a wildcard\n", - "f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:%')" + "f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:*')" ] }, { diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index 34b1cba9..4fe293ba 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -279,11 +279,13 @@ def _render_filter(self, f, stmt): else: value = f[2] - # Case insensitive wildcard matching + # Case insensitive wildcard matching (wildcard is '*') if f[1] == "~=": - return stmt.where(column_ref[0].ilike(value)) - # Case sensitive wildcard matching + tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%') + return stmt.where(column_ref[0].ilike(tmp)) + # Case sensitive wildcard matching (wildcard is '*') elif f[1] == "~==": + tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%') return stmt.where(column_ref[0].like(value)) # General case using traditional boolean operator else: diff --git a/tests/end_to_end_tests/test_query.py b/tests/end_to_end_tests/test_query.py index 5dca69d8..386c864d 100644 --- a/tests/end_to_end_tests/test_query.py +++ b/tests/end_to_end_tests/test_query.py @@ -109,16 +109,17 @@ def test_query_between_columns(dummy_file): assert getattr(r, "dataset.name") == _NAME assert getattr(r, "dataset.version_string") == _V_STRING + @pytest.mark.parametrize( "op,qstr,ans,tag", [ - ("~=", "DESC:datasets:test_query_name_nocasewildcard%", 3, "nocasewildcard"), + ("~=", "DESC:datasets:test_query_name_nocasewildcard*", 3, "nocasewildcard"), ("==", "DESC:datasets:test_query_name_exactmatch_first", 1, "exactmatch"), - ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard%", 0, "casewildcardfail"), - ("~==", "DESC:datasets:test_query_name_nocasewildcard%", 3, "casewildcardpass"), + ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard*", 0, "casewildcardfail"), + ("~==", "DESC:datasets:test_query_name_nocasewildcard*", 3, "casewildcardpass"), ], ) -def test_query_name(dummy_file,op,qstr,ans,tag): +def test_query_name(dummy_file, op, qstr, ans, tag): """Test a quering on a partial name with wildcards""" # Establish connection to database From 47ef94e83072f738f539e3acfd092698eb0fa8ed Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 1 Nov 2024 11:38:43 +0100 Subject: [PATCH 10/12] Fix bug --- src/dataregistry/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index 4fe293ba..a054289b 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -286,7 +286,7 @@ def _render_filter(self, f, stmt): # Case sensitive wildcard matching (wildcard is '*') elif f[1] == "~==": tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%') - return stmt.where(column_ref[0].like(value)) + return stmt.where(column_ref[0].like(tmp)) # General case using traditional boolean operator else: return stmt.where(column_ref[0].__getattribute__(the_op)(value)) From a99b3ba177610f4053be9fae143e03cf194df2b7 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Fri, 1 Nov 2024 11:51:14 +0100 Subject: [PATCH 11/12] Fix query test for sqlite --- tests/end_to_end_tests/test_query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/end_to_end_tests/test_query.py b/tests/end_to_end_tests/test_query.py index 386c864d..2614f75b 100644 --- a/tests/end_to_end_tests/test_query.py +++ b/tests/end_to_end_tests/test_query.py @@ -109,7 +109,9 @@ def test_query_between_columns(dummy_file): assert getattr(r, "dataset.name") == _NAME assert getattr(r, "dataset.version_string") == _V_STRING - +@pytest.mark.skipif( + datareg.db_connection._dialect == "sqlite", reason="wildcards break for sqlite" +) @pytest.mark.parametrize( "op,qstr,ans,tag", [ From 264ebfb966cf706a6809ca6a8012da49dcbb1a23 Mon Sep 17 00:00:00 2001 From: Stuart McAlpine Date: Tue, 12 Nov 2024 14:41:55 +0100 Subject: [PATCH 12/12] Restrict ~= to only work on certain columns --- src/dataregistry/query.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/dataregistry/query.py b/src/dataregistry/query.py index a054289b..70fd965a 100644 --- a/src/dataregistry/query.py +++ b/src/dataregistry/query.py @@ -76,6 +76,12 @@ .union(LITE_TYPES) ) +ILIKE_ALLOWED = [ + "dataset.name", + "dataset.owner", + "dataset.relative_path", + "dataset.access_api" +] def is_orderable_type(ctype): return type(ctype) in ALL_ORDERABLE @@ -279,14 +285,20 @@ def _render_filter(self, f, stmt): else: value = f[2] - # Case insensitive wildcard matching (wildcard is '*') - if f[1] == "~=": - tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%') - return stmt.where(column_ref[0].ilike(tmp)) - # Case sensitive wildcard matching (wildcard is '*') - elif f[1] == "~==": + # String partial matching with wildcard + if f[1] in ["~=", "~=="]: + if f[0] not in ILIKE_ALLOWED: + raise ValueError(f"Can only perform ~= search on {ILIKE_ALLOWED}") + tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%') - return stmt.where(column_ref[0].like(tmp)) + + # Case insensitive wildcard matching (wildcard is '*') + if f[1] == "~=": + return stmt.where(column_ref[0].ilike(tmp)) + # Case sensitive wildcard matching (wildcard is '*') + else: + return stmt.where(column_ref[0].like(tmp)) + # General case using traditional boolean operator else: return stmt.where(column_ref[0].__getattribute__(the_op)(value))