Skip to content

Commit

Permalink
feat(l2g): merge sQTL and tuQTL colocalisation features (opentargets#824)
Browse files Browse the repository at this point in the history

* feat: merge tuQTL colocalisation results into sQTL features

* fix: add colocalisation neighbourhood features in the l2g default features list

* fix: minor bug
  • Loading branch information
ireneisdoomed authored Oct 11, 2024
1 parent c7c602a commit e3d32ba
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 212 deletions.
4 changes: 0 additions & 4 deletions docs/python_api/datasets/l2g_features/colocalisation.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,15 @@ title: From colocalisation
::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumFeature
::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumNeighbourhoodFeature
::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumNeighbourhoodFeature

## Common logic

Expand Down
10 changes: 8 additions & 2 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,12 +246,18 @@ class LocusToGeneConfig(StepConfig):
"eQtlColocClppMaximum",
"pQtlColocClppMaximum",
"sQtlColocClppMaximum",
"tuQtlColocClppMaximum",
# max H4 for each (study, locus, gene) aggregating over a specific qtl type
"eQtlColocH4Maximum",
"pQtlColocH4Maximum",
"sQtlColocH4Maximum",
"tuQtlColocH4Maximum",
# max CLPP for each (study, locus, gene) aggregating over a specific qtl type and in relation with the mean in the vicinity
"eQtlColocClppMaximumNeighbourhood",
"pQtlColocClppMaximumNeighbourhood",
"sQtlColocClppMaximumNeighbourhood",
# max H4 for each (study, locus, gene) aggregating over a specific qtl type and in relation with the mean in the vicinity
"eQtlColocH4MaximumNeighbourhood",
"pQtlColocH4MaximumNeighbourhood",
"sQtlColocH4MaximumNeighbourhood",
# distance to gene footprint
"distanceSentinelFootprint",
"distanceSentinelFootprintNeighbourhood",
Expand Down
21 changes: 13 additions & 8 deletions src/gentropy/dataset/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ def extract_maximum_coloc_probability_per_region_and_gene(
study_index: StudyIndex,
*,
filter_by_colocalisation_method: str,
filter_by_qtl: str | None = None,
filter_by_qtls: str | list[str] | None = None,
) -> DataFrame:
"""Get maximum colocalisation probability for a (studyLocus, gene) window.
Args:
study_locus (StudyLocus): Dataset containing study loci to filter the colocalisation dataset on and the geneId linked to the region
study_index (StudyIndex): Study index to use to get study metadata
filter_by_colocalisation_method (str): optional filter to apply on the colocalisation dataset
filter_by_qtl (str | None): optional filter to apply on the colocalisation dataset
filter_by_qtls (str | list[str] | None): optional filter to apply on the colocalisation dataset
Returns:
DataFrame: table with the maximum colocalisation scores for the provided study loci
Expand All @@ -63,8 +63,15 @@ def extract_maximum_coloc_probability_per_region_and_gene(
valid_qtls = list(
set(EqtlCatalogueStudyIndex.method_to_study_type_mapping.values())
)
if filter_by_qtl and filter_by_qtl not in valid_qtls:
raise ValueError(f"There are no studies with QTL type {filter_by_qtl}")

if filter_by_qtls:
filter_by_qtls = (
list(map(str.lower, [filter_by_qtls]))
if isinstance(filter_by_qtls, str)
else list(map(str.lower, filter_by_qtls))
)
if any(qtl not in valid_qtls for qtl in filter_by_qtls):
raise ValueError(f"There are no studies with QTL type {filter_by_qtls}")

if filter_by_colocalisation_method not in [
"ECaviar",
Expand All @@ -82,10 +89,8 @@ def extract_maximum_coloc_probability_per_region_and_gene(
f.col("rightGeneId").isNotNull(),
f.lower("colocalisationMethod") == filter_by_colocalisation_method.lower(),
]
if filter_by_qtl:
coloc_filtering_expr.append(
f.lower("rightStudyType") == filter_by_qtl.lower()
)
if filter_by_qtls:
coloc_filtering_expr.append(f.lower("rightStudyType").isin(filter_by_qtls))

filtered_colocalisation = (
# Bring rightStudyType and rightGeneId and filter by rows where the gene is null,
Expand Down
196 changes: 14 additions & 182 deletions src/gentropy/dataset/l2g_features/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def common_colocalisation_feature_logic(
colocalisation_method: str,
colocalisation_metric: str,
feature_name: str,
qtl_type: str,
qtl_types: list[str] | str,
*,
colocalisation: Colocalisation,
study_index: StudyIndex,
Expand All @@ -36,7 +36,7 @@ def common_colocalisation_feature_logic(
colocalisation_method (str): The colocalisation method to filter the data by
colocalisation_metric (str): The colocalisation metric to use
feature_name (str): The name of the feature to create
qtl_type (str): The type of QTL to filter the data by
qtl_types (list[str] | str): The types of QTL to filter the data by
colocalisation (Colocalisation): Dataset with the colocalisation results
study_index (StudyIndex): Study index to fetch study type and gene
study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
Expand All @@ -55,7 +55,7 @@ def common_colocalisation_feature_logic(
study_locus,
study_index,
filter_by_colocalisation_method=colocalisation_method,
filter_by_qtl=qtl_type,
filter_by_qtls=qtl_types,
),
on=joining_cols,
)
Expand All @@ -73,7 +73,7 @@ def common_neighbourhood_colocalisation_feature_logic(
colocalisation_method: str,
colocalisation_metric: str,
feature_name: str,
qtl_type: str,
qtl_types: list[str] | str,
*,
colocalisation: Colocalisation,
study_index: StudyIndex,
Expand All @@ -86,7 +86,7 @@ def common_neighbourhood_colocalisation_feature_logic(
colocalisation_method (str): The colocalisation method to filter the data by
colocalisation_metric (str): The colocalisation metric to use
feature_name (str): The name of the feature to create
qtl_type (str): The type of QTL to filter the data by
qtl_types (list[str] | str): The types of QTL to filter the data by
colocalisation (Colocalisation): Dataset with the colocalisation results
study_index (StudyIndex): Study index to fetch study type and gene
study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
Expand All @@ -101,7 +101,7 @@ def common_neighbourhood_colocalisation_feature_logic(
colocalisation_method,
colocalisation_metric,
local_feature_name,
qtl_type,
qtl_types,
colocalisation=colocalisation,
study_index=study_index,
study_locus=study_locus,
Expand Down Expand Up @@ -310,15 +310,15 @@ def compute(
"""
colocalisation_method = "ECaviar"
colocalisation_metric = "clpp"
qtl_type = "sqtl"
qtl_types = ["sqtl", "tuqtl"]
return cls(
_df=convert_from_wide_to_long(
common_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
qtl_types,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
Expand Down Expand Up @@ -352,99 +352,15 @@ def compute(
"""
colocalisation_method = "ECaviar"
colocalisation_metric = "clpp"
qtl_type = "sqtl"
return cls(
_df=convert_from_wide_to_long(
common_neighbourhood_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
),
_schema=cls.get_schema(),
)


class TuQtlColocClppMaximumFeature(L2GFeature):
"""Max CLPP for each (study, locus, gene) aggregating over all tuQTLs."""

feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
feature_name = "tuQtlColocClppMaximum"

@classmethod
def compute(
cls: type[TuQtlColocClppMaximumFeature],
study_loci_to_annotate: StudyLocus | L2GGoldStandard,
feature_dependency: dict[str, Any],
) -> TuQtlColocClppMaximumFeature:
"""Computes the feature.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
feature_dependency (dict[str, Any]): Dataset with the colocalisation results
Returns:
TuQtlColocClppMaximumFeature: Feature dataset
"""
colocalisation_method = "ECaviar"
colocalisation_metric = "clpp"
qtl_type = "tuqtl"
return cls(
_df=convert_from_wide_to_long(
common_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
),
_schema=cls.get_schema(),
)


class TuQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
"""Max CLPP for each (study, locus) aggregating over all tuQTLs."""

feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
feature_name = "tuQtlColocClppMaximumNeighbourhood"

@classmethod
def compute(
cls: type[TuQtlColocClppMaximumNeighbourhoodFeature],
study_loci_to_annotate: StudyLocus | L2GGoldStandard,
feature_dependency: dict[str, Any],
) -> TuQtlColocClppMaximumNeighbourhoodFeature:
"""Computes the feature.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
feature_dependency (dict[str, Any]): Dataset with the colocalisation results
Returns:
TuQtlColocClppMaximumNeighbourhoodFeature: Feature dataset
"""
colocalisation_method = "ECaviar"
colocalisation_metric = "clpp"
qtl_type = "tuqtl"
qtl_types = ["sqtl", "tuqtl"]
return cls(
_df=convert_from_wide_to_long(
common_neighbourhood_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
qtl_types,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
Expand Down Expand Up @@ -646,15 +562,15 @@ def compute(
"""
colocalisation_method = "Coloc"
colocalisation_metric = "h4"
qtl_type = "sqtl"
qtl_types = ["sqtl", "tuqtl"]
return cls(
_df=convert_from_wide_to_long(
common_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
qtl_types,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
Expand Down Expand Up @@ -688,99 +604,15 @@ def compute(
"""
colocalisation_method = "Coloc"
colocalisation_metric = "h4"
qtl_type = "sqtl"
qtl_types = ["sqtl", "tuqtl"]
return cls(
_df=convert_from_wide_to_long(
common_neighbourhood_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
),
_schema=cls.get_schema(),
)


class TuQtlColocH4MaximumFeature(L2GFeature):
"""Max H4 for each (study, locus, gene) aggregating over all tuQTLs."""

feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
feature_name = "tuQtlColocH4Maximum"

@classmethod
def compute(
cls: type[TuQtlColocH4MaximumFeature],
study_loci_to_annotate: StudyLocus | L2GGoldStandard,
feature_dependency: dict[str, Any],
) -> TuQtlColocH4MaximumFeature:
"""Computes the feature.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
feature_dependency (dict[str, Any]): Dataset with the colocalisation results
Returns:
TuQtlColocH4MaximumFeature: Feature dataset
"""
colocalisation_method = "Coloc"
colocalisation_metric = "h4"
qtl_type = "tuqtl"
return cls(
_df=convert_from_wide_to_long(
common_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
),
_schema=cls.get_schema(),
)


class TuQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
"""Max H4 for each (study, locus) aggregating over all tuQTLs."""

feature_dependency_type = [Colocalisation, StudyIndex, StudyLocus]
feature_name = "tuQtlColocH4MaximumNeighbourhood"

@classmethod
def compute(
cls: type[TuQtlColocH4MaximumNeighbourhoodFeature],
study_loci_to_annotate: StudyLocus | L2GGoldStandard,
feature_dependency: dict[str, Any],
) -> TuQtlColocH4MaximumNeighbourhoodFeature:
"""Computes the feature.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
feature_dependency (dict[str, Any]): Dataset with the colocalisation results
Returns:
TuQtlColocH4MaximumNeighbourhoodFeature: Feature dataset
"""
colocalisation_method = "Coloc"
colocalisation_metric = "h4"
qtl_type = "tuqtl"
return cls(
_df=convert_from_wide_to_long(
common_colocalisation_feature_logic(
study_loci_to_annotate,
colocalisation_method,
colocalisation_metric,
cls.feature_name,
qtl_type,
qtl_types,
**feature_dependency,
),
id_vars=("studyLocusId", "geneId"),
Expand Down
Loading

0 comments on commit e3d32ba

Please sign in to comment.