feat: redefine neighbourhood features to represent similarity with best metric + other fixes (opentargets#913)

* feat: mean to max

* fix: remove protein coding

* fix: adding protein coding

* feat(l2g): neighbourhood features are a division between local and regional

* feat(l2g): regional max for distance features only consider protein coding genes

* fix(coloc_features): regional max for coloc features only consider protein coding genes

* fix(vep_features): regional max for vep features only consider protein coding genes

* feat(l2g): train and predict based on protein coding genes only

* feat: set nbh feature to 1 if features are 0 in the region

* feat: set nbh feature to 1 if features are 0 in the region

* Revert "feat: set nbh feature to 1 if features are 0 in the region"

This reverts commit da145ab.

* fix: return nbh features only for protein coding genes + optimisation

* test: change expected results based on changes

* test: change expected results based on changes

* fix: test

---------

Co-authored-by: Yakov Tsepilov <[email protected]>
ireneisdoomed and addramir authored Nov 15, 2024
1 parent c46480b commit 40ca215
Showing 9 changed files with 117 additions and 99 deletions.
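At a glance, the commit replaces the old "local minus regional mean" neighbourhood features with a "local divided by regional best" definition, computed only over protein-coding genes. Below is a minimal, self-contained sketch of that pattern on toy data; the column names and the toy DataFrame are illustrative, not the gentropy API.

import pyspark.sql.functions as f
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy local scores for one credible set (studyLocusId "sl1").
local_scores = spark.createDataFrame(
    [
        ("sl1", "geneA", "protein_coding", 0.9),
        ("sl1", "geneB", "protein_coding", 0.3),
        ("sl1", "geneC", "lncRNA", 0.8),
    ],
    ["studyLocusId", "geneId", "biotype", "localScore"],
)

# Neighbourhood feature = local score / best score among protein-coding genes in the locus.
window = Window.partitionBy("studyLocusId")
neighbourhood = (
    local_scores.filter(f.col("biotype") == "protein_coding")  # non-coding genes are excluded
    .withColumn("regionalMax", f.max("localScore").over(window))
    .withColumn(
        "neighbourhoodScore",
        f.when(
            f.col("regionalMax") != 0.0,
            f.col("localScore") / f.col("regionalMax"),
        ).otherwise(f.lit(0.0)),  # all-zero regions fall back to 0.0
    )
)
neighbourhood.show()  # geneA -> 1.0, geneB -> ~0.33; geneC never appears

The per-file diffs below apply this same shape to the colocalisation, distance and VEP features, and additionally restrict training and prediction to protein-coding genes.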
2 changes: 1 addition & 1 deletion src/gentropy/config.py
@@ -263,7 +263,7 @@ class LocusToGeneConfig(StepConfig):
"geneCount500kb",
"proteinGeneCount500kb",
"credibleSetConfidence",
"isProteinCoding",
# "isProteinCoding",
]
)
hyperparameters: dict[str, Any] = field(
29 changes: 17 additions & 12 deletions src/gentropy/dataset/l2g_features/colocalisation.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.colocalisation import Colocalisation
@@ -168,23 +169,27 @@ def common_neighbourhood_colocalisation_feature_logic(
study_locus,
)
)
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non protein coding genes in the vicinity are excluded see #3552)
regional_mean_per_study_locus = (
return (
extended_local_max.join(
gene_index.df.select("geneId", "biotype"), "geneId", "left"
# Compute the maximum score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non-protein-coding genes in the vicinity are excluded, see #3552)
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.filter(f.col("biotype") == "protein_coding")
.groupBy("studyLocusId")
.agg(f.mean(local_feature_name).alias("regional_mean"))
)
return (
local_max.join(regional_mean_per_study_locus, "studyLocusId", "left")
.withColumn(
feature_name,
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.drop("regional_mean", local_feature_name)
.drop("regional_max", local_feature_name)
)


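A note on the guard used above: the isNotNull check and the coalesce cover a missing regional maximum, while the != 0.0 comparison prevents a division by zero when every score in the region is 0, in which case the feature falls back to 0.0. A toy sketch of the same when/otherwise shape in isolation, with illustrative column names rather than gentropy code:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Rows with a usable regional maximum get local/regional; zero or missing maxima yield 0.0.
df = spark.createDataFrame(
    [(0.5, 1.0), (0.0, 0.0), (0.2, None)],
    ["localScore", "regionalMax"],
)
guarded = df.withColumn(
    "ratio",
    f.when(
        f.col("regionalMax").isNotNull() & (f.col("regionalMax") != 0.0),
        f.col("localScore") / f.coalesce(f.col("regionalMax"), f.lit(0.0)),
    ).otherwise(f.lit(0.0)),
)
guarded.show()  # ratios: 0.5, 0.0, 0.0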
41 changes: 29 additions & 12 deletions src/gentropy/dataset/l2g_features/distance.py
@@ -8,6 +8,7 @@
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.study_locus import StudyLocus
@@ -55,7 +56,7 @@ def common_distance_feature_logic(
agg_expr = f.sum(f.col("distance_score"))
elif "Sentinel" in feature_name:
df = study_loci_to_annotate.df.select("studyLocusId", "variantId")
# For minimum distances we calculate the unweighted distance between the sentinel (lead) and the gene. This
# For minimum distances we calculate the unweighted distance between the sentinel (lead) and the gene.
distance_score_expr = f.lit(genomic_window) - f.col(distance_type) + f.lit(1)
agg_expr = f.first(f.col("distance_score"))
return (
@@ -84,15 +85,17 @@ def common_neighbourhood_distance_feature_logic(
variant_index: VariantIndex,
feature_name: str,
distance_type: str,
gene_index: GeneIndex,
genomic_window: int = 500_000,
) -> DataFrame:
"""Calculate the distance feature that correlates any variant in a credible set with any gene nearby the locus. The distance is weighted by the posterior probability of the variant to factor in its contribution to the trait.
"""Calculate the distance feature that correlates any variant in a credible set with any protein coding gene nearby the locus. The distance is weighted by the posterior probability of the variant to factor in its contribution to the trait.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
variant_index (VariantIndex): The dataset containing distance to gene information
feature_name (str): The name of the feature
distance_type (str): The type of distance to gene
gene_index (GeneIndex): The dataset containing gene information
genomic_window (int): The maximum window size to consider
Returns:
@@ -109,16 +112,30 @@
)
return (
# Then compute the maximum distance score in the vicinity (feature will be the same for any gene associated with a studyLocus)
local_metric.withColumn(
"regional_metric",
f.mean(f.col(local_feature_name)).over(Window.partitionBy("studyLocusId")),
local_metric.join(
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.withColumn(
feature_name,
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.withColumn(
feature_name,
(f.col(local_feature_name) - f.col("regional_metric"))
/ f.log10(f.lit(genomic_window + 1)),
f.when(f.col(feature_name) < 0, f.lit(0.0))
.when(f.col(feature_name) > 1, f.lit(1.0))
.otherwise(f.col(feature_name)),
)
.drop("regional_metric", local_feature_name)
.drop("regional_max", local_feature_name)
)


@@ -168,7 +185,7 @@ def compute(
class DistanceTssMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to TSS for all genes in the vicinity of a studyLocus."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceTssMeanNeighbourhood"

@classmethod
@@ -244,7 +261,7 @@ def compute(
class DistanceSentinelTssNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene TSS as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceSentinelTssNeighbourhood"

@classmethod
@@ -325,7 +342,7 @@ def compute(
class DistanceFootprintMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to footprint for all genes in the vicinity of a studyLocus."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceFootprintMeanNeighbourhood"

@classmethod
@@ -401,7 +418,7 @@ def compute(
class DistanceSentinelFootprintNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene footprint as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceSentinelFootprintNeighbourhood"

@classmethod
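For the distance features the same ratio is computed, and a second withColumn then clips the result to the [0, 1] interval, replacing the previous log10-scaled difference from the regional mean. A toy sketch of that clipping step alone, with an assumed column name rather than the gentropy helpers:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Clip the neighbourhood ratio to [0, 1]: negatives become 0.0, anything above 1 becomes 1.0.
scores = spark.createDataFrame(
    [(-0.2,), (0.4,), (1.3,)],
    ["distanceTssMeanNeighbourhood"],
)
clipped = scores.withColumn(
    "distanceTssMeanNeighbourhood",
    f.when(f.col("distanceTssMeanNeighbourhood") < 0, f.lit(0.0))
    .when(f.col("distanceTssMeanNeighbourhood") > 1, f.lit(1.0))
    .otherwise(f.col("distanceTssMeanNeighbourhood")),
)
clipped.show()  # -0.2 -> 0.0, 0.4 -> 0.4, 1.3 -> 1.0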
36 changes: 20 additions & 16 deletions src/gentropy/dataset/l2g_features/vep.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.gene_index import GeneIndex
@@ -79,7 +80,7 @@ def common_neighbourhood_vep_feature_logic(
gene_index: GeneIndex,
feature_name: str,
) -> DataFrame:
"""Extracts variant severity score computed from VEP for any gene, based on what is the mean score for protein coding genes that are nearby the locus.
"""Extracts variant severity score computed from VEP for any gene, based on what is the max score for protein coding genes that are nearby the locus.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
@@ -95,26 +96,29 @@
study_loci_to_annotate,
feature_name=local_feature_name,
variant_index=variant_index,
).join(
# Bring gene classification
gene_index.df.select("geneId", "biotype"),
"geneId",
"inner",
)
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non protein coding genes in the vicinity are excluded see #3552)
regional_mean_per_study_locus = (
local_metric.filter(f.col("biotype") == "protein_coding")
.groupBy("studyLocusId")
.agg(f.mean(local_feature_name).alias("regional_mean"))
)
return (
local_metric.join(regional_mean_per_study_locus, "studyLocusId", "left")
local_metric
# Compute the maximum score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non-protein-coding genes in the vicinity are excluded, see #3552)
.join(
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.withColumn(
feature_name,
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.drop("regional_mean", local_feature_name, "biotype")
.drop("regional_max", local_feature_name)
)


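For intuition on the switch from a mean-difference to a max-ratio definition: with toy scores of 1.0, 0.5 and 0.0 for the protein-coding genes in a locus, the old definition gives the top gene 1.0 - 0.5 = 0.5, while the new one gives it 1.0 / 1.0 = 1.0, i.e. "as good as the best gene in the region". A plain-Python comparison with those toy numbers (illustrative only, not repository code):

# Old definition: local score minus the regional mean.
# New definition: local score divided by the regional max (similarity with the best gene).
scores = {"geneA": 1.0, "geneB": 0.5, "geneC": 0.0}  # protein-coding genes in one locus

regional_mean = sum(scores.values()) / len(scores)
regional_max = max(scores.values())

old_feature = {gene: score - regional_mean for gene, score in scores.items()}
new_feature = {
    gene: (score / regional_max if regional_max != 0.0 else 0.0)
    for gene, score in scores.items()
}

print(old_feature)  # {'geneA': 0.5, 'geneB': 0.0, 'geneC': -0.5}
print(new_feature)  # {'geneA': 1.0, 'geneB': 0.5, 'geneC': 0.0}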
1 change: 1 addition & 0 deletions src/gentropy/dataset/l2g_gold_standard.py
@@ -132,6 +132,7 @@ def build_feature_matrix(
on=["studyId", "variantId", "geneId"],
how="inner",
)
.filter(f.col("isProteinCoding") == 1)
.drop("studyId", "variantId")
.distinct(),
with_gold_standard=True,
1 change: 1 addition & 0 deletions src/gentropy/dataset/l2g_prediction.py
@@ -78,6 +78,7 @@ def from_credible_set(
credible_set.df.filter(f.col("studyType") == "gwas")
.select("studyLocusId")
.join(feature_matrix._df, "studyLocusId")
.filter(f.col("isProteinCoding") == 1)
)
)
.fill_na()
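The previous two files apply the same restriction at both ends of the pipeline: the gold-standard feature matrix used for training and the feature matrix scored at prediction time keep only rows flagged as protein coding. A minimal sketch of that filter on a toy feature matrix; only the isProteinCoding comparison is taken from the diff, the surrounding DataFrame is illustrative:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A toy feature matrix; rows for non-protein-coding genes are dropped before training/prediction.
feature_matrix = spark.createDataFrame(
    [
        ("sl1", "geneA", 1, 0.9),
        ("sl1", "geneC", 0, 0.7),
    ],
    ["studyLocusId", "geneId", "isProteinCoding", "distanceTssMeanNeighbourhood"],
)
coding_only = feature_matrix.filter(f.col("isProteinCoding") == 1)
coding_only.show()  # only the geneA row remains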
2 changes: 1 addition & 1 deletion src/gentropy/l2g.py
@@ -163,7 +163,7 @@ def __init__(
session, credible_set_path, recursiveFileLookup=True
)
self.feature_matrix = L2GFeatureMatrix(
_df=session.load_data(feature_matrix_path), features_list=self.features_list
_df=session.load_data(feature_matrix_path),
)

if run_mode == "predict":