Merge pull request #396 from kjsanger/feature/multi-sample-metadata

Add support for data objects having metadata for >1 sample
wtsi-npg · Jan 8, 2025 · b2ddae7 · b2ddae7
2 parents 36a8506 + 1a1b1e1
commit b2ddae7
Show file tree

Hide file tree

Showing 4 changed files with 142 additions and 65 deletions.
diff --git a/src/npg_irods/common.py b/src/npg_irods/common.py
@@ -401,37 +401,34 @@ def ensure_secondary_metadata_updated(
        True if updated.
     """
     secondary_metadata, acl = [], []
-    sample, study = None, None
+    samples, study = [], None
 
     if item.metadata(TrackedStudy.ID):
-        study_id = item.avu(TrackedStudy.ID).value
-        study = find_study_by_study_id(mlwh_session, study_id)
+        study = find_study_by_study_id(mlwh_session, item.avu(TrackedStudy.ID).value)
 
-    if item.metadata(TrackedSample.ID):
-        sample_id = item.avu(TrackedSample.ID).value
-        sample = find_sample_by_sample_id(mlwh_session, sample_id)
-
-    if study is None and sample is None:
-        raise ValueError(
-            f"Failed to update metadata on {item}. No {TrackedStudy.ID} or "
-            f"{TrackedSample.ID} or metadata are present"
-        )
+    for avu in item.metadata(TrackedSample.ID):
+        sample = find_sample_by_sample_id(mlwh_session, avu.value)
+        samples.append(sample)
 
     zone = infer_zone(item)
-    if sample is None:
-        secondary_metadata.extend(make_study_metadata(study))
-        acl.extend(make_study_acl(study, zone=zone))
-    elif study is None:
-        secondary_metadata.extend(make_sample_metadata(sample))
-        log.warn(
-            f"Item has no {TrackedStudy.ID} metadata to allow permissions to be set. "
-            "It will be set as unreadable by default.",
-            path=item,
-        )
-    else:
-        secondary_metadata.extend(make_study_metadata(study))
-        secondary_metadata.extend(make_sample_metadata(sample))
-        acl.extend(make_sample_acl(sample, study, zone=zone))
+
+    match study, samples:
+        case None, []:
+            raise ValueError(
+                f"Failed to update metadata on {item}. No {TrackedStudy.ID} or "
+                f"{TrackedSample.ID} or metadata are present"
+            )
+        case study, []:
+            secondary_metadata.extend(make_study_metadata(study))
+            acl.extend(make_study_acl(study, zone=zone))
+        case None, [*sm]:
+            for s in sm:
+                secondary_metadata.extend(make_sample_metadata(s))
+        case study, [*sm]:
+            secondary_metadata.extend(make_study_metadata(study))
+            for s in sm:
+                secondary_metadata.extend(make_sample_metadata(s))
+                acl.extend(make_sample_acl(s, study, zone=zone))
 
     secondary_metadata = sorted(set(secondary_metadata))
     acl = sorted(set(acl))

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -127,7 +127,7 @@ def mlwh_session() -> Session:
     drop_database(engine.url)
 
 
-def initialize_mlwh_study_and_sample(session: Session):
+def initialize_mlwh_study_and_samples(session: Session):
     """
     Insert ML warehouse test data for synthetic runs.
 
@@ -148,7 +148,7 @@ def initialize_mlwh_study_and_sample(session: Session):
     )
     session.add(study_x)
 
-    sample_y = Sample(
+    sample_1 = Sample(
         id_lims="LIMS_01",
         common_name="common_name1",
         donor_id="donor_id1",
@@ -157,18 +157,33 @@ def initialize_mlwh_study_and_sample(session: Session):
         name="name1",
         public_name="public_name1",
         supplier_name="supplier_name1",
-        uuid_sample_lims="82429892-0ab6-11ee-b5ba-fa163eac3af7",
+        uuid_sample_lims="82429892-0ab6-11ee-b5ba-fa163eac3ag7",
         **default_timestamps,
     )
-    session.add(sample_y)
+    session.add(sample_1)
+
+    sample_2 = Sample(
+        id_lims="LIMS_01",
+        common_name="common_name2",
+        donor_id="donor_id2",
+        id_sample_lims="id_sample_lims2",
+        accession_number="Test Accession",
+        name="name2",
+        public_name="public_name2",
+        supplier_name="supplier_name2",
+        uuid_sample_lims="82429892-0ab6-11ee-b5ba-fa163eac3ag8",
+        **default_timestamps,
+    )
+    session.add(sample_2)
 
     session.commit()
 
 
 @pytest.fixture(scope="function")
-def simple_study_and_sample_mlwh(mlwh_session) -> Session:
-    """An ML warehouse database fixture populated with test study records."""
-    initialize_mlwh_study_and_sample(mlwh_session)
+def study_and_samples_mlwh(mlwh_session) -> Session:
+    """An ML warehouse database fixture populated with a single test study and some
+    sample records."""
+    initialize_mlwh_study_and_samples(mlwh_session)
     yield mlwh_session
 
 
@@ -229,11 +244,11 @@ def annotated_data_object(tmp_path):
 
 
 @pytest.fixture(scope="function")
-def simple_study_and_sample_data_object(tmp_path, irods_groups):
+def single_study_and_single_sample_data_object(tmp_path, irods_groups):
     """A fixture providing a collection containing a single data object with sample
-    and study metadata"""
+    and study metadata."""
     root_path = PurePath(
-        "/testZone/home/irods/test/simple_study_and_sample_data_object"
+        "/testZone/home/irods/test/single_study_and_single_sample_data_object"
     )
     rods_path = add_rods_path(root_path, tmp_path)
 
@@ -251,6 +266,31 @@ def simple_study_and_sample_data_object(tmp_path, irods_groups):
         remove_rods_path(rods_path)
 
 
+@pytest.fixture(scope="function")
+def single_study_and_multi_sample_data_object(tmp_path, irods_groups):
+    """A fixture providing a collection containing a single data object with multiple
+    sample and single study metadata."""
+    root_path = PurePath(
+        "/testZone/home/irods/test/single_study_and_multi_sample_data_object"
+    )
+    rods_path = add_rods_path(root_path, tmp_path)
+
+    obj_path = rods_path / "lorem.txt"
+    iput("./tests/data/simple/data_object/lorem.txt", obj_path)
+
+    obj = DataObject(obj_path)
+    obj.add_metadata(
+        AVU(TrackedSample.ID, "id_sample_lims1"),
+        AVU(TrackedSample.ID, "id_sample_lims2"),
+        AVU(TrackedStudy.ID, "1000"),
+    )
+
+    try:
+        yield obj_path
+    finally:
+        remove_rods_path(rods_path)
+
+
 @pytest.fixture(scope="function")
 def consent_withdrawn_gapi_data_object(tmp_path):
     root_path = PurePath("/testZone/home/irods/test/consent_withdrawn_gapi_data_object")

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -23,11 +23,11 @@
 
 
 class TestCommonFunctions:
-    @m.context("When an iRODS object has both study and sample ID metadata")
+    @m.context("When an iRODS object has both one study and one sample ID metadata")
     @m.context("When it wants both study and sample metadata enhanced")
     @m.it("Updates study and sample metadata from MLWH")
     def test_ensure_secondary_metadata_updated(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
         sample_id = "id_sample_lims1"
         study_id = "1000"
@@ -45,57 +45,97 @@ def test_ensure_secondary_metadata_updated(
             AVU(TrackedSample.NAME, "name1"),
             AVU(TrackedSample.PUBLIC_NAME, "public_name1"),
             AVU(TrackedSample.SUPPLIER_NAME, "supplier_name1"),
-            AVU(TrackedSample.UUID, "82429892-0ab6-11ee-b5ba-fa163eac3af7"),
+            AVU(TrackedSample.UUID, "82429892-0ab6-11ee-b5ba-fa163eac3ag7"),
         ]
 
-        obj = DataObject(simple_study_and_sample_data_object)
-        assert ensure_secondary_metadata_updated(obj, simple_study_and_sample_mlwh)
+        obj = DataObject(single_study_and_single_sample_data_object)
+        assert ensure_secondary_metadata_updated(obj, study_and_samples_mlwh)
+
+        for avu in expected_avus:
+            assert avu in obj.metadata()
+
+    @m.context("When an iRODS object has one study ID and multiple sample ID metadata")
+    @m.context("When it wants both study and sample metadata enhanced")
+    @m.it("Updates study and sample metadata from MLWH")
+    def test_ensure_secondary_metadata_updated_multiple_samples(
+        self,
+        single_study_and_multi_sample_data_object,
+        study_and_samples_mlwh,
+    ):
+        sample_id1 = "id_sample_lims1"
+        sample_id2 = "id_sample_lims2"
+        study_id = "1000"
+
+        expected_avus = [
+            AVU(TrackedStudy.ID, study_id),
+            AVU(TrackedStudy.NAME, "Study X"),
+            AVU(TrackedStudy.TITLE, "Test Study Title"),
+            AVU(TrackedStudy.ACCESSION_NUMBER, "Test Accession"),
+            AVU(TrackedSample.ID, sample_id1),
+            AVU(TrackedSample.ACCESSION_NUMBER, "Test Accession"),
+            AVU(TrackedSample.COMMON_NAME, "common_name1"),
+            AVU(TrackedSample.DONOR_ID, "donor_id1"),
+            AVU(TrackedSample.NAME, "name1"),
+            AVU(TrackedSample.PUBLIC_NAME, "public_name1"),
+            AVU(TrackedSample.SUPPLIER_NAME, "supplier_name1"),
+            AVU(TrackedSample.ID, sample_id2),
+            AVU(TrackedSample.ACCESSION_NUMBER, "Test Accession"),
+            AVU(TrackedSample.COMMON_NAME, "common_name2"),
+            AVU(TrackedSample.DONOR_ID, "donor_id2"),
+            AVU(TrackedSample.NAME, "name2"),
+            AVU(TrackedSample.PUBLIC_NAME, "public_name2"),
+            AVU(TrackedSample.SUPPLIER_NAME, "supplier_name2"),
+        ]
+
+        obj = DataObject(single_study_and_multi_sample_data_object)
+        assert ensure_secondary_metadata_updated(obj, study_and_samples_mlwh)
+
         for avu in expected_avus:
             assert avu in obj.metadata()
 
     @m.context("When an iRODS object has both study and sample ID metadata")
     @m.context("When it wants both study and sample metadata enhanced")
     @m.it("Updates permissions according to the study")
     def test_ensure_secondary_metadata_permissions_updated(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
         zone = "testZone"
 
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj = DataObject(single_study_and_single_sample_data_object)
         assert obj.permissions() == [AC("irods", perm=Permission.OWN, zone=zone)]
-        assert ensure_secondary_metadata_updated(obj, simple_study_and_sample_mlwh)
+        assert ensure_secondary_metadata_updated(obj, study_and_samples_mlwh)
         assert obj.permissions() == [
             AC("irods", perm=Permission.OWN, zone=zone),
             AC("ss_1000", perm=Permission.READ, zone=zone),
         ]
 
-    @m.context("When an iRODS object has study ID, but not sample ID metadata")
+    @m.context("When an iRODS object has one study ID, but no sample ID metadata")
     @m.context("When it wants study metadata enhanced")
     @m.it("Updates permissions according to the study")
     def test_ensure_secondary_metadata_permissions_updated_no_sample(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
         zone = "testZone"
 
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj = DataObject(single_study_and_single_sample_data_object)
         obj.remove_metadata(AVU(TrackedSample.ID, "id_sample_lims1"))
 
         assert obj.permissions() == [AC("irods", perm=Permission.OWN, zone=zone)]
-        assert ensure_secondary_metadata_updated(obj, simple_study_and_sample_mlwh)
+        assert ensure_secondary_metadata_updated(obj, study_and_samples_mlwh)
         assert obj.permissions() == [
             AC("irods", perm=Permission.OWN, zone=zone),
             AC("ss_1000", perm=Permission.READ, zone=zone),
         ]
 
-    @m.context("When an iRODS object has no study ID")
+    @m.context("When an iRODS object has no study ID and one sample ID metadata")
     @m.context("When it wants study metadata enhanced")
     @m.it("Removes access permissions")
     def test_ensure_secondary_metadata_permissions_updated_no_study(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
         zone = "testZone"
 
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj = DataObject(single_study_and_single_sample_data_object)
         obj.remove_metadata(AVU(TrackedStudy.ID, "1000"))
         obj.add_permissions(
             AC("ss_1000", perm=Permission.READ, zone=zone),
@@ -105,5 +145,5 @@ def test_ensure_secondary_metadata_permissions_updated_no_study(
             AC("irods", perm=Permission.OWN, zone=zone),
             AC("ss_1000", perm=Permission.READ, zone=zone),
         ]
-        assert ensure_secondary_metadata_updated(obj, simple_study_and_sample_mlwh)
+        assert ensure_secondary_metadata_updated(obj, study_and_samples_mlwh)
         assert obj.permissions() == [AC("irods", perm=Permission.OWN, zone=zone)]
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
@@ -645,13 +645,13 @@ class TestMetadataUtilities:
     @m.context("When it wants both study and sample metadata enhanced")
     @m.it("Counts successes correctly")
     def test_update_secondary_metadata(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
-        obj_path = simple_study_and_sample_data_object.as_posix()
+        obj_path = single_study_and_single_sample_data_object.as_posix()
 
         with StringIO("\n".join([obj_path])) as reader:
             with StringIO() as writer:
-                engine = simple_study_and_sample_mlwh.get_bind()
+                engine = study_and_samples_mlwh.get_bind()
                 num_processed, num_updated, num_errors = update_secondary_metadata(
                     reader, writer, engine, print_update=True
                 )
@@ -675,27 +675,27 @@ def test_update_secondary_metadata(
             AVU(TrackedSample.NAME, "name1"),
             AVU(TrackedSample.PUBLIC_NAME, "public_name1"),
             AVU(TrackedSample.SUPPLIER_NAME, "supplier_name1"),
-            AVU(TrackedSample.UUID, "82429892-0ab6-11ee-b5ba-fa163eac3af7"),
+            AVU(TrackedSample.UUID, "82429892-0ab6-11ee-b5ba-fa163eac3ag7"),
         ]
 
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj = DataObject(single_study_and_single_sample_data_object)
         for avu in expected_avus:
             assert avu in obj.metadata()
 
     @m.context("When an iRODS object has study ID, but not sample ID metadata")
     @m.context("When it wants study metadata enhanced")
     @m.it("Counts successes correctly")
     def test_update_secondary_metadata_with_just_study(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
-        obj_path = simple_study_and_sample_data_object.as_posix()
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj_path = single_study_and_single_sample_data_object.as_posix()
+        obj = DataObject(single_study_and_single_sample_data_object)
 
         assert obj.remove_metadata(AVU(TrackedSample.ID, "id_sample_lims1")) == 1
 
         with StringIO("\n".join([obj_path])) as reader:
             with StringIO() as writer:
-                engine = simple_study_and_sample_mlwh.get_bind()
+                engine = study_and_samples_mlwh.get_bind()
                 num_processed, num_updated, num_errors = update_secondary_metadata(
                     reader, writer, engine, print_update=True
                 )
@@ -720,16 +720,16 @@ def test_update_secondary_metadata_with_just_study(
     @m.context("When it wants sample metadata enhanced")
     @m.it("Counts successes correctly")
     def test_update_general_metadata_with_just_study_err(
-        self, simple_study_and_sample_data_object, simple_study_and_sample_mlwh
+        self, single_study_and_single_sample_data_object, study_and_samples_mlwh
     ):
-        obj_path = simple_study_and_sample_data_object.as_posix()
-        obj = DataObject(simple_study_and_sample_data_object)
+        obj_path = single_study_and_single_sample_data_object.as_posix()
+        obj = DataObject(single_study_and_single_sample_data_object)
 
         assert obj.remove_metadata(AVU(TrackedStudy.ID, "1000")) == 1
 
         with StringIO("\n".join([obj_path])) as reader:
             with StringIO() as writer:
-                engine = simple_study_and_sample_mlwh.get_bind()
+                engine = study_and_samples_mlwh.get_bind()
                 num_processed, num_updated, num_errors = update_secondary_metadata(
                     reader, writer, engine, print_update=True
                 )