feat(l2g): merge sQTL and tuQTL colocalisation features (opentargets#824)

* feat: merge tuQTL colocalisation results into sQTL features
* fix: add colocalisation neighbourhood features in the l2g default features list
* fix: minor bug
thehyve · Oct 11, 2024 · e3d32ba · e3d32ba
1 parent c7c602a
commit e3d32ba
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 212 deletions.
diff --git a/docs/python_api/datasets/l2g_features/colocalisation.md b/docs/python_api/datasets/l2g_features/colocalisation.md
@@ -7,19 +7,15 @@ title: From colocalisation
 ::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumFeature
-::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumFeature
-::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumFeature
 ::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumNeighbourhoodFeature
 ::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumNeighbourhoodFeature
 ::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumNeighbourhoodFeature
-::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumNeighbourhoodFeature
 ::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumNeighbourhoodFeature
 ::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumNeighbourhoodFeature
 ::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumNeighbourhoodFeature
-::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumNeighbourhoodFeature
 
 ## Common logic
 

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -246,12 +246,18 @@ class LocusToGeneConfig(StepConfig):
             "eQtlColocClppMaximum",
             "pQtlColocClppMaximum",
             "sQtlColocClppMaximum",
-            "tuQtlColocClppMaximum",
             # max H4 for each (study, locus, gene) aggregating over a specific qtl type
             "eQtlColocH4Maximum",
             "pQtlColocH4Maximum",
             "sQtlColocH4Maximum",
-            "tuQtlColocH4Maximum",
+            # max CLPP for each (study, locus, gene) aggregating over a specific qtl type and in relation with the mean in the vicinity
+            "eQtlColocClppMaximumNeighbourhood",
+            "pQtlColocClppMaximumNeighbourhood",
+            "sQtlColocClppMaximumNeighbourhood",
+            # max H4 for each (study, locus, gene) aggregating over a specific qtl type and in relation with the mean in the vicinity
+            "eQtlColocH4MaximumNeighbourhood",
+            "pQtlColocH4MaximumNeighbourhood",
+            "sQtlColocH4MaximumNeighbourhood",
             # distance to gene footprint
             "distanceSentinelFootprint",
             "distanceSentinelFootprintNeighbourhood",

diff --git a/src/gentropy/dataset/colocalisation.py b/src/gentropy/dataset/colocalisation.py
@@ -42,15 +42,15 @@ def extract_maximum_coloc_probability_per_region_and_gene(
         study_index: StudyIndex,
         *,
         filter_by_colocalisation_method: str,
-        filter_by_qtl: str | None = None,
+        filter_by_qtls: str | list[str] | None = None,
     ) -> DataFrame:
         """Get maximum colocalisation probability for a (studyLocus, gene) window.
 
         Args:
             study_locus (StudyLocus): Dataset containing study loci to filter the colocalisation dataset on and the geneId linked to the region
             study_index (StudyIndex): Study index to use to get study metadata
             filter_by_colocalisation_method (str): optional filter to apply on the colocalisation dataset
-            filter_by_qtl (str | None): optional filter to apply on the colocalisation dataset
+            filter_by_qtls (str | list[str] | None): optional filter to apply on the colocalisation dataset
 
         Returns:
             DataFrame: table with the maximum colocalisation scores for the provided study loci
@@ -63,8 +63,15 @@ def extract_maximum_coloc_probability_per_region_and_gene(
         valid_qtls = list(
             set(EqtlCatalogueStudyIndex.method_to_study_type_mapping.values())
         )
-        if filter_by_qtl and filter_by_qtl not in valid_qtls:
-            raise ValueError(f"There are no studies with QTL type {filter_by_qtl}")
+
+        if filter_by_qtls:
+            filter_by_qtls = (
+                list(map(str.lower, [filter_by_qtls]))
+                if isinstance(filter_by_qtls, str)
+                else list(map(str.lower, filter_by_qtls))
+            )
+            if any(qtl not in valid_qtls for qtl in filter_by_qtls):
+                raise ValueError(f"There are no studies with QTL type {filter_by_qtls}")
 
         if filter_by_colocalisation_method not in [
             "ECaviar",
@@ -82,10 +89,8 @@ def extract_maximum_coloc_probability_per_region_and_gene(
             f.col("rightGeneId").isNotNull(),
             f.lower("colocalisationMethod") == filter_by_colocalisation_method.lower(),
         ]
-        if filter_by_qtl:
-            coloc_filtering_expr.append(
-                f.lower("rightStudyType") == filter_by_qtl.lower()
-            )
+        if filter_by_qtls:
+            coloc_filtering_expr.append(f.lower("rightStudyType").isin(filter_by_qtls))
 
         filtered_colocalisation = (
             # Bring rightStudyType and rightGeneId and filter by rows where the gene is null,

diff --git a/src/gentropy/dataset/l2g_features/colocalisation.py b/src/gentropy/dataset/l2g_features/colocalisation.py
@@ -23,7 +23,7 @@ def common_colocalisation_feature_logic(
     colocalisation_method: str,
     colocalisation_metric: str,
     feature_name: str,
-    qtl_type: str,
+    qtl_types: list[str] | str,
     *,
     colocalisation: Colocalisation,
     study_index: StudyIndex,
@@ -36,7 +36,7 @@ def common_colocalisation_feature_logic(
         colocalisation_method (str): The colocalisation method to filter the data by
         colocalisation_metric (str): The colocalisation metric to use
         feature_name (str): The name of the feature to create
-        qtl_type (str): The type of QTL to filter the data by
+        qtl_types (list[str] | str): The types of QTL to filter the data by
         colocalisation (Colocalisation): Dataset with the colocalisation results
         study_index (StudyIndex): Study index to fetch study type and gene
         study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
@@ -55,7 +55,7 @@ def common_colocalisation_feature_logic(
                 study_locus,
                 study_index,
                 filter_by_colocalisation_method=colocalisation_method,
-                filter_by_qtl=qtl_type,
+                filter_by_qtls=qtl_types,
             ),
             on=joining_cols,
         )
@@ -73,7 +73,7 @@ def common_neighbourhood_colocalisation_feature_logic(
     colocalisation_method: str,
     colocalisation_metric: str,
     feature_name: str,
-    qtl_type: str,
+    qtl_types: list[str] | str,
     *,
     colocalisation: Colocalisation,
     study_index: StudyIndex,
@@ -86,7 +86,7 @@ def common_neighbourhood_colocalisation_feature_logic(
         colocalisation_method (str): The colocalisation method to filter the data by
         colocalisation_metric (str): The colocalisation metric to use
         feature_name (str): The name of the feature to create
-        qtl_type (str): The type of QTL to filter the data by
+        qtl_types (list[str] | str): The types of QTL to filter the data by
         colocalisation (Colocalisation): Dataset with the colocalisation results
         study_index (StudyIndex): Study index to fetch study type and gene
         study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
@@ -101,7 +101,7 @@ def common_neighbourhood_colocalisation_feature_logic(
         colocalisation_method,
         colocalisation_metric,
         local_feature_name,
-        qtl_type,
+        qtl_types,
         colocalisation=colocalisation,
         study_index=study_index,
         study_locus=study_locus,
@@ -310,15 +310,15 @@ def compute(
         """
         colocalisation_method = "ECaviar"
         colocalisation_metric = "clpp"
-        qtl_type = "sqtl"
+        qtl_types = ["sqtl", "tuqtl"]
         return cls(
             _df=convert_from_wide_to_long(
                 common_colocalisation_feature_logic(
                     study_loci_to_annotate,
                     colocalisation_method,
                     colocalisation_metric,
                     cls.feature_name,
-                    qtl_type,
+                    qtl_types,
                     **feature_dependency,
                 ),
                 id_vars=("studyLocusId", "geneId"),
@@ -352,99 +352,15 @@ def compute(
         """
         colocalisation_method = "ECaviar"
         colocalisation_metric = "clpp"
-        qtl_type = "sqtl"
-        return cls(
-            _df=convert_from_wide_to_long(
-                common_neighbourhood_colocalisation_feature_logic(
-                    study_loci_to_annotate,
-                    colocalisation_method,
-                    colocalisation_metric,
-                    cls.feature_name,
-                    qtl_type,
-                    **feature_dependency,
-                ),
-                id_vars=("studyLocusId", "geneId"),
-                var_name="featureName",
-                value_name="featureValue",
-            ),
-            _schema=cls.get_schema(),
-        )
-
-
-class TuQtlColocClppMaximumFeature(L2GFeature):
-    """Max CLPP for each (study, locus, gene) aggregating over all tuQTLs."""
-
-    feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
-    feature_name = "tuQtlColocClppMaximum"
-
-    @classmethod
-    def compute(
-        cls: type[TuQtlColocClppMaximumFeature],
-        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
-        feature_dependency: dict[str, Any],
-    ) -> TuQtlColocClppMaximumFeature:
-        """Computes the feature.
-
-        Args:
-            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
-            feature_dependency (dict[str, Any]): Dataset with the colocalisation results
-
-        Returns:
-            TuQtlColocClppMaximumFeature: Feature dataset
-        """
-        colocalisation_method = "ECaviar"
-        colocalisation_metric = "clpp"
-        qtl_type = "tuqtl"
-        return cls(
-            _df=convert_from_wide_to_long(
-                common_colocalisation_feature_logic(
-                    study_loci_to_annotate,
-                    colocalisation_method,
-                    colocalisation_metric,
-                    cls.feature_name,
-                    qtl_type,
-                    **feature_dependency,
-                ),
-                id_vars=("studyLocusId", "geneId"),
-                var_name="featureName",
-                value_name="featureValue",
-            ),
-            _schema=cls.get_schema(),
-        )
-
-
-class TuQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
-    """Max CLPP for each (study, locus) aggregating over all tuQTLs."""
-
-    feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
-    feature_name = "tuQtlColocClppMaximumNeighbourhood"
-
-    @classmethod
-    def compute(
-        cls: type[TuQtlColocClppMaximumNeighbourhoodFeature],
-        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
-        feature_dependency: dict[str, Any],
-    ) -> TuQtlColocClppMaximumNeighbourhoodFeature:
-        """Computes the feature.
-
-        Args:
-            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
-            feature_dependency (dict[str, Any]): Dataset with the colocalisation results
-
-        Returns:
-            TuQtlColocClppMaximumNeighbourhoodFeature: Feature dataset
-        """
-        colocalisation_method = "ECaviar"
-        colocalisation_metric = "clpp"
-        qtl_type = "tuqtl"
+        qtl_types = ["sqtl", "tuqtl"]
         return cls(
             _df=convert_from_wide_to_long(
                 common_neighbourhood_colocalisation_feature_logic(
                     study_loci_to_annotate,
                     colocalisation_method,
                     colocalisation_metric,
                     cls.feature_name,
-                    qtl_type,
+                    qtl_types,
                     **feature_dependency,
                 ),
                 id_vars=("studyLocusId", "geneId"),
@@ -646,15 +562,15 @@ def compute(
         """
         colocalisation_method = "Coloc"
         colocalisation_metric = "h4"
-        qtl_type = "sqtl"
+        qtl_types = ["sqtl", "tuqtl"]
         return cls(
             _df=convert_from_wide_to_long(
                 common_colocalisation_feature_logic(
                     study_loci_to_annotate,
                     colocalisation_method,
                     colocalisation_metric,
                     cls.feature_name,
-                    qtl_type,
+                    qtl_types,
                     **feature_dependency,
                 ),
                 id_vars=("studyLocusId", "geneId"),
@@ -688,99 +604,15 @@ def compute(
         """
         colocalisation_method = "Coloc"
         colocalisation_metric = "h4"
-        qtl_type = "sqtl"
+        qtl_types = ["sqtl", "tuqtl"]
         return cls(
             _df=convert_from_wide_to_long(
                 common_neighbourhood_colocalisation_feature_logic(
                     study_loci_to_annotate,
                     colocalisation_method,
                     colocalisation_metric,
                     cls.feature_name,
-                    qtl_type,
-                    **feature_dependency,
-                ),
-                id_vars=("studyLocusId", "geneId"),
-                var_name="featureName",
-                value_name="featureValue",
-            ),
-            _schema=cls.get_schema(),
-        )
-
-
-class TuQtlColocH4MaximumFeature(L2GFeature):
-    """Max H4 for each (study, locus, gene) aggregating over all tuQTLs."""
-
-    feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
-    feature_name = "tuQtlColocH4Maximum"
-
-    @classmethod
-    def compute(
-        cls: type[TuQtlColocH4MaximumFeature],
-        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
-        feature_dependency: dict[str, Any],
-    ) -> TuQtlColocH4MaximumFeature:
-        """Computes the feature.
-
-        Args:
-            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
-            feature_dependency (dict[str, Any]): Dataset with the colocalisation results
-
-        Returns:
-            TuQtlColocH4MaximumFeature: Feature dataset
-        """
-        colocalisation_method = "Coloc"
-        colocalisation_metric = "h4"
-        qtl_type = "tuqtl"
-        return cls(
-            _df=convert_from_wide_to_long(
-                common_colocalisation_feature_logic(
-                    study_loci_to_annotate,
-                    colocalisation_method,
-                    colocalisation_metric,
-                    cls.feature_name,
-                    qtl_type,
-                    **feature_dependency,
-                ),
-                id_vars=("studyLocusId", "geneId"),
-                var_name="featureName",
-                value_name="featureValue",
-            ),
-            _schema=cls.get_schema(),
-        )
-
-
-class TuQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
-    """Max H4 for each (study, locus) aggregating over all tuQTLs."""
-
-    feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
-    feature_name = "tuQtlColocH4MaximumNeighbourhood"
-
-    @classmethod
-    def compute(
-        cls: type[TuQtlColocH4MaximumNeighbourhoodFeature],
-        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
-        feature_dependency: dict[str, Any],
-    ) -> TuQtlColocH4MaximumNeighbourhoodFeature:
-        """Computes the feature.
-
-        Args:
-            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
-            feature_dependency (dict[str, Any]): Dataset with the colocalisation results
-
-        Returns:
-            TuQtlColocH4MaximumNeighbourhoodFeature: Feature dataset
-        """
-        colocalisation_method = "Coloc"
-        colocalisation_metric = "h4"
-        qtl_type = "tuqtl"
-        return cls(
-            _df=convert_from_wide_to_long(
-                common_colocalisation_feature_logic(
-                    study_loci_to_annotate,
-                    colocalisation_method,
-                    colocalisation_metric,
-                    cls.feature_name,
-                    qtl_type,
+                    qtl_types,
                     **feature_dependency,
                 ),
                 id_vars=("studyLocusId", "geneId"),