Skip to content

Commit

Permalink
Fix Illumina flowcells query to use the entity_type column
Browse files Browse the repository at this point in the history
The query to find Illumina flowcells in the MLWH by component was
using tag index to try to identify controls. This is not a reliable
method; entity_type should be used instead. The set of valid entity
types is defined in the metadata of the MLWH schema.
  • Loading branch information
kjsanger committed Jan 9, 2024
1 parent eef8337 commit 3d7336a
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 39 deletions.
67 changes: 36 additions & 31 deletions src/npg_irods/illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from typing import Iterator, Optional, Type

from partisan.irods import AVU, Collection, DataObject
from partisan.metadata import AsValueEnum
from sqlalchemy import asc
from sqlalchemy.orm import Session
from structlog import get_logger
Expand Down Expand Up @@ -57,11 +58,17 @@ class TagIndex(Enum):
Rather, it is a bin for reads that cannot be associated with any of the candidate
tags in a pool after sequencing."""

CONTROL_198 = 198

CONTROL_888 = 888
"""Tag index 888 is conventionally used to indicate a control sample e.g. Phi X
that has been added to a pool."""
@unique
class EntityType(AsValueEnum):
    """The type of sequenced material applied to a flowcell.

    This relates to the entity_type column in the MLWH. The values are defined
    in the MLWH schema metadata.
    """

    # find_flowcells_by_component filters out LIBRARY_CONTROL and
    # LIBRARY_INDEXED_SPIKE rows when it is called with include_controls=False.
    LIBRARY = "library"
    LIBRARY_CONTROL = "library_control"
    LIBRARY_INDEXED = "library_indexed"
    LIBRARY_INDEXED_SPIKE = "library_indexed_spike"


@dataclass(order=True)
Expand Down Expand Up @@ -141,7 +148,7 @@ def __repr__(self):
if self.tag_index is not None:
rep[SeqConcept.TAG_INDEX.value] = self.tag_index
if self.subset is not None:
rep[SeqConcept.SUBSET.value] = self.subset
rep[SeqConcept.SUBSET.value] = self.subset.value

return json.dumps(rep, sort_keys=True, separators=(",", ":"))

Expand Down Expand Up @@ -202,25 +209,29 @@ def empty_acl(*args):
return []

if requires_full_metadata(item):
log.info("Requires full metadata", path=item)
log.debug("Requires full metadata", path=item)
sample_fn, study_fn = make_sample_metadata, make_study_metadata
else:
log.info("Requires reduced metadata", path=item)
log.debug("Requires reduced metadata", path=item)
sample_fn, study_fn = make_reduced_sample_metadata, make_reduced_study_metadata

if requires_managed_access(item):
log.info("Requires managed access", path=item)
log.debug("Requires managed access", path=item)
acl_fn = make_sample_acl
else:
log.info("Does not require managed access", path=item)
log.debug("Does not require managed access", path=item)
acl_fn = empty_acl

# Each component may be associated with multiple flowcells
components = find_associated_components(item)
log.debug("Found associated components", path=item, comp=components)

for c in components:
for fc in find_flowcells_by_component(
flowcells = find_flowcells_by_component(
mlwh_session, c, include_controls=include_controls
):
)
log.debug("Found associated flowcells", path=item, flowcells=flowcells, comp=c)
for fc in flowcells:
secondary_metadata.extend(sample_fn(fc.sample))
secondary_metadata.extend(study_fn(fc.study))
acl.extend(acl_fn(fc.sample, fc.study, zone=zone))
Expand Down Expand Up @@ -467,31 +478,25 @@ def find_flowcells_by_component(
sess.query(IseqFlowcell)
.distinct()
.join(IseqFlowcell.iseq_product_metrics)
.filter(IseqProductMetrics.id_run == component.id_run)
.filter(
IseqProductMetrics.id_run == component.id_run,
IseqFlowcell.position == component.position,
)
)

if component.position is not None:
query = query.filter(IseqProductMetrics.position == component.position)
if not include_controls:
query = query.filter(
IseqFlowcell.entity_type.notin_(
[
EntityType.LIBRARY_CONTROL.value,
EntityType.LIBRARY_INDEXED_SPIKE.value,
]
)
)

match component.tag_index:
case TagIndex.CONTROL_198.value | TagIndex.CONTROL_888.value:
if not include_controls:
query = query.filter(
IseqProductMetrics.tag_index.notin_(
[TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value]
)
)

query = query.filter(IseqProductMetrics.tag_index == component.tag_index)
case TagIndex.BIN.value:
if not include_controls:
query = query.filter(
IseqProductMetrics.tag_index.notin_(
[TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value]
)
)

query = query.filter(IseqProductMetrics.tag_index.is_not(None))
query = query.filter(IseqProductMetrics.tag_index.isnot(None))
case int():
query = query.filter(IseqProductMetrics.tag_index == component.tag_index)
case None:
Expand Down
61 changes: 53 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Sample,
Study,
)
from npg_irods.illumina import EntityType
from npg_irods.metadata import illumina, ont
from npg_irods.metadata.common import DataFile, SeqConcept
from npg_irods.metadata.lims import TrackedSample
Expand Down Expand Up @@ -413,34 +414,78 @@ def initialize_mlwh_illumina_synthetic(session: Session):

sample_info = [
# Not multiplexed
{"study": study_a, "sample": sample1, "position": 1, "tag_index": None},
{
"study": study_a,
"sample": sample1,
"position": 1,
"tag_index": None,
"entity_type": EntityType.LIBRARY.value,
},
# Multiplexed, samples from the same study
{"study": study_a, "sample": sample1, "position": 1, "tag_index": 1},
{"study": study_a, "sample": sample2, "position": 1, "tag_index": 2},
{"study": study_a, "sample": sample1, "position": 2, "tag_index": 1},
{"study": study_a, "sample": sample2, "position": 2, "tag_index": 2},
{
"study": study_a,
"sample": sample1,
"position": 1,
"tag_index": 1,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
{
"study": study_a,
"sample": sample2,
"position": 1,
"tag_index": 2,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
{
"study": study_a,
"sample": sample1,
"position": 2,
"tag_index": 1,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
{
"study": study_a,
"sample": sample2,
"position": 2,
"tag_index": 2,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
# Multiplexed, samples from different studies
{"study": study_a, "sample": sample1, "position": 2, "tag_index": 1},
{"study": study_b, "sample": sample3, "position": 2, "tag_index": 2},
{
"study": study_a,
"sample": sample1,
"position": 2,
"tag_index": 1,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
{
"study": study_b,
"sample": sample3,
"position": 2,
"tag_index": 2,
"entity_type": EntityType.LIBRARY_INDEXED.value,
},
# Phi X
{
"study": control_study,
"sample": control_sample,
"position": 1,
"tag_index": 888,
"entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value,
},
{
"study": control_study,
"sample": control_sample,
"position": 2,
"tag_index": 888,
"entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value,
},
]

flowcells = [
IseqFlowcell(
entity_id_lims=f"ENTITY_01",
entity_type=f"ENTITY_TYPE_01",
entity_type=info["entity_type"],
id_flowcell_lims=f"FLOWCELL{i}",
id_lims="LIMS_01",
id_pool_lims=f"POOL_01",
Expand Down

0 comments on commit 3d7336a

Please sign in to comment.