Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update queries #163

Merged
merged 12 commits into from
Nov 26, 2024
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## Version 1.0.4

Make more options for querying

- There is now a `~=` query operator that can utilise the `.ilike` filter to
  allow non-case-sensitive filtering with wildcards (i.e., the `*` character).
- `dregs ls` can now filter on the dataset name, including `*` wildcards, using
  the `--name` option.
- `dregs ls` can return arbitrary columns using the `--return_cols` option.

## Version 1.0.3

Some changes to the way the `relative_path` is automatically generated from the
Expand Down
16 changes: 14 additions & 2 deletions docs/source/tutorial_cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,23 @@ For example, to see all the datasets from the DESC Generic Working Group we woul

dregs ls --owner "DESC Generic Working Group"

To list entries from all owners type
To list entries from all owners do ``--owner none``.

You can search against the ``dataset.name`` column, with wildcard support,
where ``*`` is the wildcard character, e.g.,

.. code-block:: bash

dregs ls --name dataset:dc2:*

will search for all datasets whose name starts with the pattern "dataset:dc2:"
(note the ``--name`` queries here are case insensitive).

To select what columns are printed in the result use the ``--return_cols`` option, e.g.,

.. code-block:: bash

dregs ls --all
dregs ls --return_cols dataset_id name description status

Using ``dregs ls`` is a quick and easy way to remind yourself what names you
gave to previous datasets, and what relative paths they reside at.
27 changes: 27 additions & 0 deletions docs/source/tutorial_notebooks/query_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,33 @@
"\n",
"The allowed boolean logic operators are: `==`, `!=`, `<`, `<=`, `>` and `>=`.\n",
"\n",
"A special operator, `~=`, can be used to perform wildcard queries, where `*` is the wildcard character. This is particularly useful when only a partial dataset name is known, or when we want to return all datasets with a similar naming pattern, for example"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578951ce-cc30-4ab3-88b4-7bb98b734b9c",
"metadata": {},
"outputs": [],
"source": [
"# Create a filter that queries on the dataset name with a wildcard\n",
"f = datareg.Query.gen_filter('dataset.name', '~=', 'nersc_tutorial:*')"
]
},
{
"cell_type": "markdown",
"id": "877e32d9-08b6-4121-8afa-07f3ed8f8524",
"metadata": {},
"source": [
"will return all datasets whose name begins with the pattern `nersc_tutorial:`. The `~=` operator is case insensitive, for case sensitive wildcard searching, one can use the `~==` operator."
]
},
{
"cell_type": "markdown",
"id": "37c23c39-7a78-4931-a281-226a7bfc9333",
"metadata": {},
"source": [
"### Performing the query\n",
"\n",
"Now we can pass this filter through to a query using the `Query` extension of the `DataRegistry` class, e.g.,"
Expand Down
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.3"
__version__ = "1.0.4"
28 changes: 26 additions & 2 deletions src/dataregistry/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
"<=": "__le__",
">": "__gt__",
">=": "__ge__",
"~=": None,
"~==": None,
}

ALL_ORDERABLE = (
Expand All @@ -74,6 +76,12 @@
.union(LITE_TYPES)
)

# Column identifiers on which the wildcard operators (`~=` / `~==`) may be
# used.  These operators are implemented with SQLAlchemy's `ilike`/`like`
# (see `_render_filter`), which are string-matching operators, so the list
# is restricted to string-valued dataset columns.
ILIKE_ALLOWED = [
    "dataset.name",
    "dataset.owner",
    "dataset.relative_path",
    "dataset.access_api"
]

def is_orderable_type(ctype):
    """Return True when the exact type of *ctype* is one of ``ALL_ORDERABLE``."""
    # Deliberately an exact-type membership test (not isinstance), so
    # subclasses of orderable types are not themselves treated as orderable.
    return any(type(ctype) is orderable for orderable in ALL_ORDERABLE)
Expand Down Expand Up @@ -272,12 +280,28 @@ def _render_filter(self, f, stmt):

# Extract the property we are ordering on (also making sure it
# is orderable)
if not column_is_orderable[0] and f[1] not in ["==", "=", "!="]:
if not column_is_orderable[0] and f[1] not in ["~==", "~=", "==", "=", "!="]:
raise ValueError('check_filter: Cannot apply "{f[1]}" to "{f[0]}"')
else:
value = f[2]

return stmt.where(column_ref[0].__getattribute__(the_op)(value))
# String partial matching with wildcard
if f[1] in ["~=", "~=="]:
if f[0] not in ILIKE_ALLOWED:
raise ValueError(f"Can only perform ~= search on {ILIKE_ALLOWED}")

tmp = value.replace('%', r'\%').replace('_', r'\_').replace('*', '%')

# Case insensitive wildcard matching (wildcard is '*')
if f[1] == "~=":
return stmt.where(column_ref[0].ilike(tmp))
# Case sensitive wildcard matching (wildcard is '*')
else:
return stmt.where(column_ref[0].like(tmp))

# General case using traditional boolean operator
else:
return stmt.where(column_ref[0].__getattribute__(the_op)(value))

def _append_filter_tables(self, tables_required, filters):
"""
Expand Down
9 changes: 7 additions & 2 deletions src/dataregistry_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,14 @@ def get_parser():
help="List datasets for a given owner type",
choices=["user", "group", "production", "project"],
)
arg_ls.add_argument("--all", help="List all datasets", action="store_true")
arg_ls.add_argument(
"--extended", help="List more properties than the default", action="store_true"
"--name", help="Only return datasets with a given name (wildcard support)"
)
arg_ls.add_argument(
"--return_cols",
help="List of columns to return in the query",
nargs="+",
type=str,
)
arg_ls.add_argument(
"--max_rows",
Expand Down
93 changes: 56 additions & 37 deletions src/dataregistry_cli/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,50 @@
import os
from dataregistry import DataRegistry
import pandas as pd
from dataregistry import Filter


def _render_filters(datareg, args):
    """
    Build the list of query filters from the CLI arguments.

    Both the `owner` and `owner_type` columns can be filtered against. In
    addition, the dataset `name` column can be filtered against using the
    case-insensitive `~=` operator, where `*` is the wildcard character.

    Passing ``--owner none`` suppresses the owner filter entirely (i.e.,
    list datasets from all owners).

    Keywords can also be filtered against. These have to be treated separately,
    as they need to be linked to the keywords table.

    Parameters
    ----------
    datareg : DataRegistry object
    args : argparse object

    Returns
    -------
    filters : list[Filter]
    """

    filters = []

    # Dataset columns we can filter by
    queriables = ["owner", "owner_type", "name"]

    print("\nDataRegistry query:", end=" ")
    for col in queriables:
        value = getattr(args, col)
        if value is None:
            continue

        if col == "name":
            # Wildcard name matching (`~=` is case insensitive, `*` wildcard)
            filters.append(Filter(f"dataset.{col}", "~=", value))
            # Echo the operator actually used, not `==`
            print(f"{col}~={value}", end=" ")
        else:
            # `--owner none` is a sentinel meaning "do not filter on owner"
            if not (col == "owner" and value.lower() == "none"):
                filters.append(Filter(f"dataset.{col}", "==", value))
            print(f"{col}=={value}", end=" ")

    # Keyword filters are built via `gen_filter` as they must be linked to
    # the keywords table
    if args.keyword is not None:
        filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))

    return filters


def dregs_ls(args):
Expand All @@ -20,6 +64,9 @@ def dregs_ls(args):
Owner to list dataset entries for
args.owner_type : str
Owner type to list dataset entries for
args.name : str
Filter to only those results with a given dataset name (% can be used
as a wildcard)
args.all : bool
True to show all datasets, no filters
args.config_file : str
Expand All @@ -30,8 +77,8 @@ def dregs_ls(args):
Path to root_dir
args.site : str
Look up root_dir using a site
args.extended : bool
True to list more dataset properties
args.return_cols : list[str]
List of dataset columns to return
args.max_chars : int
Maximum number of character to print per column
args.max_rows : int
Expand Down Expand Up @@ -59,28 +106,12 @@ def dregs_ls(args):
else:
datareg_prod = None

# Filter on dataset owner and/or owner_type
filters = []
# By default, search for "our" dataset
if args.owner is None:
args.owner = os.getenv("USER")

print("\nDataRegistry query:", end=" ")
if not args.all:
# Add owner_type filter
if args.owner_type is not None:
filters.append(Filter("dataset.owner_type", "==", args.owner_type))
print(f"owner_type=={args.owner_type}", end=" ")

# Add owner filter
if args.owner is None:
if args.owner_type is None:
filters.append(
datareg.Query.gen_filter("dataset.owner", "==", os.getenv("USER"))
)
print(f"owner=={os.getenv('USER')}", end=" ")
else:
filters.append(datareg.Query.gen_filter("dataset.owner", "==", args.owner))
print(f"owner=={args.owner}", end=" ")
else:
print("all datasets", end=" ")
# Render search filters
filters = _render_filters(datareg, args)

# What columns are we printing
_print_cols = [
Expand All @@ -90,23 +121,11 @@ def dregs_ls(args):
"dataset.owner_type",
"dataset.description",
]
if args.extended:
_print_cols.extend(
[
"dataset.dataset_id",
"dataset.relative_path",
"dataset.status",
"dataset.register_date",
"dataset.is_overwritable",
]
)

# Add keywords filter
if args.return_cols is not None:
_print_cols = [f"dataset.{x}" for x in args.return_cols]
if args.keyword is not None:
_print_cols.append("keyword.keyword")

filters.append(datareg.Query.gen_filter("keyword.keyword", "==", args.keyword))

# Loop over this schema and the production schema and print the results
for this_datareg in [datareg, datareg_prod]:
if this_datareg is None:
Expand Down
40 changes: 40 additions & 0 deletions tests/end_to_end_tests/test_query.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import os
import pandas as pd
import sqlalchemy
Expand Down Expand Up @@ -107,3 +108,42 @@ def test_query_between_columns(dummy_file):
assert i < 1
assert getattr(r, "dataset.name") == _NAME
assert getattr(r, "dataset.version_string") == _V_STRING

@pytest.mark.skipif(
    datareg.db_connection._dialect == "sqlite", reason="wildcards break for sqlite"
)
@pytest.mark.parametrize(
    "op,qstr,ans,tag",
    [
        ("~=", "DESC:datasets:test_query_name_nocasewildcard*", 3, "nocasewildcard"),
        ("==", "DESC:datasets:test_query_name_exactmatch_first", 1, "exactmatch"),
        ("~==", "DESC:datasets:Test_Query_Name_nocasewildcard*", 0, "casewildcardfail"),
        ("~==", "DESC:datasets:test_query_name_nocasewildcard*", 3, "casewildcardpass"),
    ],
)
def test_query_name(dummy_file, op, qstr, ans, tag):
    """Query on a partial dataset name, exercising the wildcard operators."""

    # Connect to the working schema of the registry
    tmp_src_dir, tmp_root_dir = dummy_file
    datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING)

    # Register three datasets sharing the naming pattern for this `tag`
    for suffix in ("first", "second", "third"):
        _insert_dataset_entry(
            datareg,
            f"DESC:datasets:test_query_name_{tag}_{suffix}",
            "0.0.1",
        )

    # Run the name query with the operator under test
    name_filter = datareg.Query.gen_filter("dataset.name", op, qstr)
    results = datareg.Query.find_datasets(property_names=None, filters=[name_filter])

    # Check the number of matches is as expected
    if ans == 0:
        assert len(results) == 0
    else:
        assert len(results) > 0
        for column, values in results.items():
            assert len(values) == ans
Loading