diff --git a/CHANGELOG.md b/CHANGELOG.md index f74976bc2..f6c2cfcb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,11 @@ Write the date in place of the "Unreleased" in the case a new version is release # Changelog -## Unreleased +## v0.1.0b1 + +### Added + +- Support for `FullText` search on SQLite-backed catalogs ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 19f11eb34..deb6aacd8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,6 +143,7 @@ dataframe = [ dev = [ "coverage", "flake8", + "importlib_resources;python_version < \"3.9\"", "ldap3", "pre-commit", "pytest <8", # TMP pin while plugins catch up diff --git a/tiled/_tests/sql/__init__.py b/tiled/_tests/sql/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tiled/_tests/sql/before_creating_fts5_virtual_table.sql b/tiled/_tests/sql/before_creating_fts5_virtual_table.sql new file mode 100644 index 000000000..c6af441e0 --- /dev/null +++ b/tiled/_tests/sql/before_creating_fts5_virtual_table.sql @@ -0,0 +1,111 @@ +PRAGMA foreign_keys=OFF; +BEGIN TRANSACTION; +CREATE TABLE nodes ( + id INTEGER NOT NULL, + "key" VARCHAR(1023) NOT NULL, + ancestors JSON, + structure_family VARCHAR(9) NOT NULL, + metadata JSON NOT NULL, + specs JSON NOT NULL, + time_created DATETIME DEFAULT (CURRENT_TIMESTAMP), + time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP), + PRIMARY KEY (id), + CONSTRAINT key_ancestors_unique_constraint UNIQUE ("key", ancestors) +); +INSERT INTO nodes VALUES(1,'x','[]','array','{"color":"blue"}','[]','2024-05-25 10:18:38','2024-05-25 10:18:38'); +CREATE TABLE structures ( + id VARCHAR(32) NOT NULL, + structure JSON NOT NULL, + PRIMARY KEY (id), + UNIQUE (id) +); +INSERT INTO structures VALUES('8e5b0a1237f27c3d04d2cb94bc695ff8','{"data_type":{"endianness":"little","kind":"i","itemsize":8},"chunks":[[3]],"shape":[3],"dims":null,"resizable":false}'); +CREATE TABLE assets ( + id INTEGER NOT NULL, + data_uri VARCHAR(1023), + is_directory BOOLEAN NOT NULL, + hash_type VARCHAR(63), + hash_content VARCHAR(1023), + size INTEGER, + time_created DATETIME DEFAULT (CURRENT_TIMESTAMP), + time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP), + PRIMARY KEY (id) +); +INSERT INTO assets VALUES(1,'file://localhost/home/dallan/Repos/bnl/tiled/data/x',1,NULL,NULL,NULL,'2024-05-25 10:18:38','2024-05-25 10:18:38'); +CREATE TABLE data_sources ( + id INTEGER NOT NULL, + node_id INTEGER NOT NULL, + structure_id VARCHAR(32), + mimetype VARCHAR(255) NOT NULL, + parameters JSON, + management VARCHAR(9) NOT NULL, + structure_family VARCHAR(9) NOT NULL, + time_created DATETIME DEFAULT (CURRENT_TIMESTAMP), + time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP), + PRIMARY KEY (id), + FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE, + FOREIGN KEY(structure_id) REFERENCES structures (id) ON DELETE CASCADE +); +INSERT INTO data_sources VALUES(1,1,'8e5b0a1237f27c3d04d2cb94bc695ff8','application/x-zarr','{}','writable','array','2024-05-25 10:18:38','2024-05-25 10:18:38'); +CREATE TABLE revisions ( + id INTEGER NOT NULL, + node_id INTEGER NOT NULL, + revision_number INTEGER NOT NULL, + metadata JSON NOT NULL, + specs JSON NOT NULL, + time_created DATETIME DEFAULT (CURRENT_TIMESTAMP), + time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP), + PRIMARY KEY (id), + CONSTRAINT node_id_revision_number_unique_constraint UNIQUE (node_id, revision_number), + FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE +); +CREATE TABLE data_source_asset_association ( + data_source_id INTEGER NOT NULL, + asset_id INTEGER NOT NULL, 
+    parameter VARCHAR(255),
+    num INTEGER,
+    PRIMARY KEY (data_source_id, asset_id),
+    CONSTRAINT parameter_num_unique_constraint UNIQUE (data_source_id, parameter, num),
+    FOREIGN KEY(data_source_id) REFERENCES data_sources (id) ON DELETE CASCADE,
+    FOREIGN KEY(asset_id) REFERENCES assets (id) ON DELETE CASCADE
+);
+INSERT INTO data_source_asset_association VALUES(1,1,'data_uri',NULL);
+CREATE TABLE alembic_version (
+    version_num VARCHAR(32) NOT NULL,
+    CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)
+);
+INSERT INTO alembic_version VALUES('e756b9381c14');
+CREATE INDEX ix_nodes_id ON nodes (id);
+CREATE INDEX top_level_metadata ON nodes (ancestors, time_created, id, metadata);
+CREATE UNIQUE INDEX ix_assets_data_uri ON assets (data_uri);
+CREATE INDEX ix_assets_id ON assets (id);
+CREATE INDEX ix_data_sources_id ON data_sources (id);
+CREATE INDEX ix_revisions_id ON revisions (id);
+CREATE TRIGGER cannot_insert_num_null_if_num_exists
+BEFORE INSERT ON data_source_asset_association
+WHEN NEW.num IS NULL
+BEGIN
+    SELECT RAISE(ABORT, 'Can only insert num=NULL if no other row exists for the same parameter')
+    WHERE EXISTS
+    (
+        SELECT 1
+        FROM data_source_asset_association
+        WHERE parameter = NEW.parameter
+        AND data_source_id = NEW.data_source_id
+    );
+END;
+CREATE TRIGGER cannot_insert_num_int_if_num_null_exists
+BEFORE INSERT ON data_source_asset_association
+WHEN NEW.num IS NOT NULL
+BEGIN
+    SELECT RAISE(ABORT, 'Can only insert INTEGER num if no NULL row exists for the same parameter')
+    WHERE EXISTS
+    (
+        SELECT 1
+        FROM data_source_asset_association
+        WHERE parameter = NEW.parameter
+        AND num IS NULL
+        AND data_source_id = NEW.data_source_id
+    );
+END;
+COMMIT;
diff --git a/tiled/_tests/test_queries.py b/tiled/_tests/test_queries.py
index cfa344659..67b22b840 100644
--- a/tiled/_tests/test_queries.py
+++ b/tiled/_tests/test_queries.py
@@ -29,7 +29,7 @@
 )
 from ..server.app import build_app
 from .conftest import TILED_TEST_POSTGRESQL_URI
-from .utils import fail_with_status_code, temp_postgres
+from .utils import fail_with_status_code, sqlite_from_dump, temp_postgres
 
 keys = list(string.ascii_lowercase)
 mapping = {
@@ -50,6 +50,10 @@
     numpy.ones(10), metadata={"color": "purple"}
 )
 
+mapping["full_text_test_case_urple"] = ArrayAdapter.from_array(
+    numpy.ones(10), metadata={"color": "urple"}
+)
+
 mapping["specs_foo_bar"] = ArrayAdapter.from_array(numpy.ones(10), specs=["foo", "bar"])
 mapping["specs_foo_bar_baz"] = ArrayAdapter.from_array(
     numpy.ones(10), specs=["foo", "bar", "baz"]
@@ -163,18 +167,53 @@ def test_contains(client):
 
 
 def test_full_text(client):
-    if client.metadata["backend"] in {"sqlite"}:
-
-        def cm():
-            return fail_with_status_code(HTTP_400_BAD_REQUEST)
-
-    else:
-        cm = nullcontext
-    with cm():
-        assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
-        # plainto_tsquery fails to find certain words, weirdly, so it is a useful
-        # test that we are using tsquery
-        assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
+    "Basic test of FullText query"
+    assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
+    # plainto_tsquery fails to find certain words, weirdly, so it is a useful
+    # test that we are using tsquery
+    assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
+    assert list(client.search(FullText("urple"))) == ["full_text_test_case_urple"]
+
+
+def test_full_text_after_migration():
+    # Load a SQL database created by an older version of Tiled, predating FullText
+    # support, and verify that the migration indexes the pre-existing metadata.
+    with sqlite_from_dump("before_creating_fts5_virtual_table.sql") as database_path:
+        subprocess.check_call(
+            [sys.executable]
+            + f"-m tiled catalog upgrade-database sqlite+aiosqlite:///{database_path}".split()
+        )
+        catalog = from_uri(database_path)
+        app = build_app(catalog)
+        with Context.from_app(app) as context:
+            client = from_context(context)
+            assert list(client.search(FullText("blue"))) == ["x"]
+            assert list(client.search(FullText("red"))) == []  # does not exist
+
+
+def test_full_text_update(client):
+    if client.metadata["backend"] == "map":
+        pytest.skip("Updating not supported")
+    # Update the metadata and check that the fulltext index stays in sync.
+    try:
+        client["full_text_test_case"].update_metadata({"color": "red"})
+        assert list(client.search(FullText("purple"))) == []
+        assert list(client.search(FullText("red"))) == ["full_text_test_case"]
+    finally:
+        # Reset the metadata in case tests are run out of order.
+        client["full_text_test_case"].update_metadata({"color": "purple"})
+
+
+def test_full_text_delete(client):
+    if client.metadata["backend"] == "map":
+        pytest.skip("Updating not supported")
+    # Delete a record and check that the fulltext index stays in sync.
+    client.write_array(numpy.ones(10), metadata={"item": "toaster"}, key="test_delete")
+    # Assert that the data was written
+    assert list(client.search(FullText("toaster"))) == ["test_delete"]
+    client.delete("test_delete")
+    assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
+    assert list(client.search(FullText("toaster"))) == []
 
 
 def test_regex(client):
diff --git a/tiled/_tests/utils.py b/tiled/_tests/utils.py
index f5836616e..538397da3 100644
--- a/tiled/_tests/utils.py
+++ b/tiled/_tests/utils.py
@@ -1,7 +1,11 @@
 import contextlib
 import getpass
+import sqlite3
+import sys
+import tempfile
 import uuid
 from enum import IntEnum
+from pathlib import Path
 
 import httpx
 import pytest
@@ -11,6 +15,11 @@
 from ..client import context
 from ..client.base import BaseClient
 
+if sys.version_info < (3, 9):
+    import importlib_resources as resources
+else:
+    from importlib import resources  # Python >= 3.9 only
+
 
 @contextlib.contextmanager
 def fail_with_status_code(status_code):
@@ -64,3 +73,19 @@ class URL_LIMITS(IntEnum):
     HUGE = 80_000
     DEFAULT = BaseClient.URL_CHARACTER_LIMIT
     TINY = 10
+
+
+@contextlib.contextmanager
+def sqlite_from_dump(filename):
+    """Create a SQLite db in a temporary directory, loading a SQL script.
+
+    The SQL script should be given as a filename, assumed to be in tiled/_tests/sql/
+    """
+    with tempfile.TemporaryDirectory() as directory:
+        database_path = Path(directory, "catalog.db")
+        conn = sqlite3.connect(database_path)
+        ref = resources.files("tiled._tests.sql") / filename
+        with resources.as_file(ref) as path:
+            conn.executescript(path.read_text())
+        conn.close()
+        yield database_path
diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py
index a329bb787..70c9ed40d 100644
--- a/tiled/catalog/adapter.py
+++ b/tiled/catalog/adapter.py
@@ -35,6 +35,7 @@
 from sqlalchemy.orm import selectinload
 from sqlalchemy.pool import AsyncAdaptedQueuePool
 from sqlalchemy.sql.expression import cast
+from sqlalchemy.sql.sqltypes import MatchType
 from starlette.status import HTTP_404_NOT_FOUND, HTTP_415_UNSUPPORTED_MEDIA_TYPE
 
 from tiled.queries import (
@@ -371,12 +372,24 @@ def structure(self):
             return self.data_sources[0].structure
         return None
 
+    def apply_conditions(self, statement):
+        # If this is a SQLite database and we are doing a full text MATCH
+        # query, we need a JOIN with the FTS5 virtual table.
+        if (self.context.engine.dialect.name == "sqlite") and any(
+            isinstance(condition.type, MatchType) for condition in self.conditions
+        ):
+            statement = statement.join(
+                orm.metadata_fts5, orm.metadata_fts5.c.rowid == orm.Node.id
+            )
+        for condition in self.conditions:
+            statement = statement.filter(condition)
+        return statement
+
     async def async_len(self):
         statement = select(func.count(orm.Node.key)).filter(
             orm.Node.ancestors == self.segments
         )
-        for condition in self.conditions:
-            statement = statement.filter(condition)
+        statement = self.apply_conditions(statement)
         async with self.context.session() as db:
             return (await db.execute(statement)).scalar_one()
 
@@ -398,17 +411,13 @@ async def lookup_adapter(
             # Search queries and access controls apply only at the top level.
assert not first_level.conditions return await first_level.lookup_adapter(segments[1:]) - statement = ( - select(orm.Node) - .filter(orm.Node.ancestors == self.segments + ancestors) - .options( - selectinload(orm.Node.data_sources).selectinload( - orm.DataSource.structure - ) - ) + statement = select(orm.Node) + statement = self.apply_conditions(statement) + statement = statement.filter( + orm.Node.ancestors == self.segments + ancestors + ).options( + selectinload(orm.Node.data_sources).selectinload(orm.DataSource.structure) ) - for condition in self.conditions: - statement = statement.filter(condition) async with self.context.session() as db: node = (await db.execute(statement.filter(orm.Node.key == key))).scalar() if node is None: @@ -953,8 +962,7 @@ async def keys_range(self, offset, limit): (offset + limit) if limit is not None else None, # noqa: E203 ) statement = select(orm.Node.key).filter(orm.Node.ancestors == self.segments) - for condition in self.conditions: - statement = statement.filter(condition) + statement = self.apply_conditions(statement) async with self.context.session() as db: return ( ( @@ -976,8 +984,7 @@ async def items_range(self, offset, limit): (offset + limit) if limit is not None else None, # noqa: E203 ) statement = select(orm.Node).filter(orm.Node.ancestors == self.segments) - for condition in self.conditions: - statement = statement.filter(condition) + statement = self.apply_conditions(statement) async with self.context.session() as db: nodes = ( ( @@ -1187,7 +1194,7 @@ def contains(query, tree): def full_text(query, tree): dialect_name = tree.engine.url.get_dialect().name if dialect_name == "sqlite": - raise UnsupportedQueryType("full_text") + condition = orm.metadata_fts5.c.metadata.match(query.text) elif dialect_name == "postgresql": tsvector = func.jsonb_to_tsvector( cast("simple", REGCONFIG), orm.Node.metadata_, cast(["string"], JSONB) diff --git a/tiled/catalog/core.py b/tiled/catalog/core.py index 6c9869e7d..a57c88a6f 100644 --- a/tiled/catalog/core.py +++ b/tiled/catalog/core.py @@ -5,6 +5,7 @@ # This is list of all valid revisions (from current to oldest). ALL_REVISIONS = [ + "ed3a4223a600", "e756b9381c14", "2ca16566d692", "1cd99c02d0c7", diff --git a/tiled/catalog/migrations/versions/ed3a4223a600_create_sqlite_table_for_fulltext_search.py b/tiled/catalog/migrations/versions/ed3a4223a600_create_sqlite_table_for_fulltext_search.py new file mode 100644 index 000000000..4d6ac30d9 --- /dev/null +++ b/tiled/catalog/migrations/versions/ed3a4223a600_create_sqlite_table_for_fulltext_search.py @@ -0,0 +1,59 @@ +"""Create sqlite table for fulltext search + +Revision ID: ed3a4223a600 +Revises: e756b9381c14 +Create Date: 2024-04-11 16:41:01.369520 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "ed3a4223a600" +down_revision = "e756b9381c14" +branch_labels = None +depends_on = None + + +def upgrade(): + connection = op.get_bind() + if connection.engine.dialect.name == "sqlite": + statements = [ + # Create an external content fts5 table. + # See https://www.sqlite.org/fts5.html Section 4.4.3. + """ + CREATE VIRTUAL TABLE metadata_fts5 USING fts5(metadata, content='nodes', content_rowid='id'); + """, + # Insert all existing node content into the fts5 table. + """ + INSERT INTO metadata_fts5(rowid, metadata) + SELECT id, metadata FROM nodes; + """, + # Triggers keep the index synchronized with the nodes table. 
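+            # For an external-content FTS5 table, rows are removed by a special
+            # INSERT whose first value is the string 'delete' (see the FTS5 docs).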
+            """
+            CREATE TRIGGER nodes_metadata_fts5_sync_ai AFTER INSERT ON nodes BEGIN
+            INSERT INTO metadata_fts5(rowid, metadata)
+            VALUES (new.id, new.metadata);
+            END;
+            """,
+            """
+            CREATE TRIGGER nodes_metadata_fts5_sync_ad AFTER DELETE ON nodes BEGIN
+            INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
+            VALUES('delete', old.id, old.metadata);
+            END;
+            """,
+            """
+            CREATE TRIGGER nodes_metadata_fts5_sync_au AFTER UPDATE ON nodes BEGIN
+            INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
+            VALUES('delete', old.id, old.metadata);
+            INSERT INTO metadata_fts5(rowid, metadata)
+            VALUES (new.id, new.metadata);
+            END;
+            """,
+        ]
+        for statement in statements:
+            op.execute(sa.text(statement))
+
+
+def downgrade():
+    pass
diff --git a/tiled/catalog/orm.py b/tiled/catalog/orm.py
index eeca4c2ac..051faf4f8 100644
--- a/tiled/catalog/orm.py
+++ b/tiled/catalog/orm.py
@@ -9,11 +9,14 @@
     ForeignKey,
     Index,
     Integer,
+    Table,
     Unicode,
     event,
+    schema,
     text,
 )
 from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from sqlalchemy.schema import UniqueConstraint
 from sqlalchemy.sql import func
@@ -94,7 +97,7 @@ class Node(Timestamped, Base):
             "id",
             "metadata",
             postgresql_using="gin",
-        )
+        ),
         # This is used by ORDER BY with the default sorting.
         # Index("ancestors_time_created", "ancestors", "time_created"),
     )
@@ -255,7 +258,7 @@ def unique_parameter_num_null_check(target, connection, **kw):
     )
 
 
-@event.listens_for(DataSourceAssetAssociation.__table__, "after_create")
+@event.listens_for(Node.__table__, "after_create")
 def create_index_metadata_tsvector_search(target, connection, **kw):
     # This creates a ts_vector based metadata search index for fulltext.
     # Postgres only feature
@@ -271,6 +274,71 @@
     )
 
 
+class FTS5Table(Table):
+    pass
+
+
+@compiles(schema.CreateTable, "sqlite")
+def _compile_fts5_virtual_table_sqlite(element: schema.CreateTable, compiler, **kw):
+    if not isinstance(element.target, FTS5Table):
+        return compiler.visit_create_table(element, **kw)
+    name = compiler.preparer.format_table(element.target)
+    cols = ", ".join(
+        # Skip the first column (rowid).
+        compiler.preparer.format_column(col)
+        for col in element.target.columns[1:]
+    )
+    return f"CREATE VIRTUAL TABLE {name} USING fts5({cols}, content='nodes', content_rowid='id')"
+
+
+@compiles(schema.CreateTable, "postgresql")
+def _compile_no_op_fts5_postgresql(element: schema.CreateTable, compiler, **kw):
+    # Preclude the creation of the FTS5 virtual table in Postgres instances,
+    # where fulltext search is handled by a different indexing mechanism.
+    if not isinstance(element.target, FTS5Table):
+        return compiler.visit_create_table(element, **kw)
+    return "SELECT 1"
+
+
+metadata_fts5 = FTS5Table(
+    "metadata_fts5", Base.metadata, Column("rowid", Integer), Column("metadata", JSON)
+)
+
+
+@event.listens_for(metadata_fts5, "after_create")
+def create_virtual_table_fts5(target, connection, **kw):
+    if connection.engine.dialect.name == "sqlite":
+        statements = [
+            # See https://www.sqlite.org/fts5.html Section 4.4.3.
+            # """
+            # CREATE VIRTUAL TABLE metadata_fts5 USING fts5(metadata, content='nodes', content_rowid='id');
+            # """,
+            # Triggers keep the index synchronized with the nodes table.
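+            # The virtual table itself is emitted by the CreateTable compiler
+            # hook above; this listener only installs the sync triggers, which
+            # mirror those added by migration ed3a4223a600 for existing catalogs.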
+ """ + CREATE TRIGGER nodes_metadata_fts5_sync_ai AFTER INSERT ON nodes BEGIN + INSERT INTO metadata_fts5(rowid, metadata) + VALUES (new.id, new.metadata); + END; + """, + """ + CREATE TRIGGER nodes_metadata_fts5_sync_ad AFTER DELETE ON nodes BEGIN + INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata) + VALUES('delete', old.id, old.metadata); + END; + """, + """ + CREATE TRIGGER nodes_metadata_fts5_sync_au AFTER UPDATE ON nodes BEGIN + INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata) + VALUES('delete', old.id, old.metadata); + INSERT INTO metadata_fts5(rowid, metadata) + VALUES (new.id, new.metadata); + END; + """, + ] + for statement in statements: + connection.execute(text(statement)) + + class DataSource(Timestamped, Base): """ The describes how to open one or more file/blobs to extract data for a Node.