feat: redefine neighbourhood features to represent similarity with best metric + other fixes (opentargets#913)

* feat: mean to max

* fix: remove protein coding

* fix: adding protein coding

* feat(l2g): neighbourhood features are a division between local and regional

* feat(l2g): regional max for distance features only consider protein coding genes

* fix(coloc_features): regional max for coloc features only consider protein coding genes

* fix(vep_features): regional max for vep features only consider protein coding genes

* feat(l2g): train and predict based on protein coding genes only

* feat: set nbh feature to 1 if features are 0 in the region

* feat: set nbh feature to 1 if features are 0 in the region

* Revert "feat: set nbh feature to 1 if features are 0 in the region"

This reverts commit da145ab.

* fix: return nbh features only for protein coding genes + optimisation

* test: change expected results based on changes

* test: change expected results based on changes

* fix: test

---------

Co-authored-by: Yakov Tsepilov <[email protected]>
ireneisdoomed and addramir authored Nov 15, 2024
1 parent c46480b commit 40ca215
Showing 9 changed files with 117 additions and 99 deletions.
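At a glance, the commit replaces the old "local minus regional mean" neighbourhood features with a "local divided by regional best" definition, computed only over protein-coding genes. Below is a minimal, self-contained sketch of that pattern on toy data; the column names and the toy DataFrame are illustrative, not the gentropy API.

import pyspark.sql.functions as f
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy local scores for one credible set (studyLocusId "sl1").
local_scores = spark.createDataFrame(
    [
        ("sl1", "geneA", "protein_coding", 0.9),
        ("sl1", "geneB", "protein_coding", 0.3),
        ("sl1", "geneC", "lncRNA", 0.8),
    ],
    ["studyLocusId", "geneId", "biotype", "localScore"],
)

# Neighbourhood feature = local score / best score among protein-coding genes in the locus.
window = Window.partitionBy("studyLocusId")
neighbourhood = (
    local_scores.filter(f.col("biotype") == "protein_coding")  # non-coding genes are excluded
    .withColumn("regionalMax", f.max("localScore").over(window))
    .withColumn(
        "neighbourhoodScore",
        f.when(
            f.col("regionalMax") != 0.0,
            f.col("localScore") / f.col("regionalMax"),
        ).otherwise(f.lit(0.0)),  # all-zero regions fall back to 0.0
    )
)
neighbourhood.show()  # geneA -> 1.0, geneB -> ~0.33; geneC never appears

The per-file diffs below apply this same shape to the colocalisation, distance and VEP features, and additionally restrict training and prediction to protein-coding genes.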
2 changes: 1 addition & 1 deletion src/gentropy/config.py
@@ -263,7 +263,7 @@ class LocusToGeneConfig(StepConfig):
"geneCount500kb",
"proteinGeneCount500kb",
"credibleSetConfidence",
"isProteinCoding",
# "isProteinCoding",
]
)
hyperparameters: dict[str, Any] = field(
29 changes: 17 additions & 12 deletions src/gentropy/dataset/l2g_features/colocalisation.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.colocalisation import Colocalisation
@@ -168,23 +169,27 @@ def common_neighbourhood_colocalisation_feature_logic(
study_locus,
)
)
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non protein coding genes in the vicinity are excluded see #3552)
regional_mean_per_study_locus = (
return (
extended_local_max.join(
gene_index.df.select("geneId", "biotype"), "geneId", "left"
# Compute the maximum score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non-protein-coding genes in the vicinity are excluded, see #3552)
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.filter(f.col("biotype") == "protein_coding")
.groupBy("studyLocusId")
.agg(f.mean(local_feature_name).alias("regional_mean"))
)
return (
local_max.join(regional_mean_per_study_locus, "studyLocusId", "left")
.withColumn(
feature_name,
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.drop("regional_mean", local_feature_name)
.drop("regional_max", local_feature_name)
)


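A note on the guard used above: the isNotNull check and the coalesce cover a missing regional maximum, while the != 0.0 comparison prevents a division by zero when every score in the region is 0, in which case the feature falls back to 0.0. A toy sketch of the same when/otherwise shape in isolation, with illustrative column names rather than gentropy code:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Rows with a usable regional maximum get local/regional; zero or missing maxima yield 0.0.
df = spark.createDataFrame(
    [(0.5, 1.0), (0.0, 0.0), (0.2, None)],
    ["localScore", "regionalMax"],
)
guarded = df.withColumn(
    "ratio",
    f.when(
        f.col("regionalMax").isNotNull() & (f.col("regionalMax") != 0.0),
        f.col("localScore") / f.coalesce(f.col("regionalMax"), f.lit(0.0)),
    ).otherwise(f.lit(0.0)),
)
guarded.show()  # ratios: 0.5, 0.0, 0.0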
41 changes: 29 additions & 12 deletions src/gentropy/dataset/l2g_features/distance.py
@@ -8,6 +8,7 @@
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.study_locus import StudyLocus
@@ -55,7 +56,7 @@ def common_distance_feature_logic(
agg_expr = f.sum(f.col("distance_score"))
elif "Sentinel" in feature_name:
df = study_loci_to_annotate.df.select("studyLocusId", "variantId")
# For minimum distances we calculate the unweighted distance between the sentinel (lead) and the gene. This
# For minimum distances we calculate the unweighted distance between the sentinel (lead) and the gene.
distance_score_expr = f.lit(genomic_window) - f.col(distance_type) + f.lit(1)
agg_expr = f.first(f.col("distance_score"))
return (
@@ -84,15 +85,17 @@ def common_neighbourhood_distance_feature_logic(
variant_index: VariantIndex,
feature_name: str,
distance_type: str,
gene_index: GeneIndex,
genomic_window: int = 500_000,
) -> DataFrame:
"""Calculate the distance feature that correlates any variant in a credible set with any gene nearby the locus. The distance is weighted by the posterior probability of the variant to factor in its contribution to the trait.
"""Calculate the distance feature that correlates any variant in a credible set with any protein coding gene nearby the locus. The distance is weighted by the posterior probability of the variant to factor in its contribution to the trait.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
variant_index (VariantIndex): The dataset containing distance to gene information
feature_name (str): The name of the feature
distance_type (str): The type of distance to gene
gene_index (GeneIndex): The dataset containing gene information
genomic_window (int): The maximum window size to consider
Returns:
@@ -109,16 +112,30 @@
)
return (
# Then compute the maximum distance score in the vicinity (feature will be the same for any gene associated with a studyLocus)
local_metric.withColumn(
"regional_metric",
f.mean(f.col(local_feature_name)).over(Window.partitionBy("studyLocusId")),
local_metric.join(
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.withColumn(
feature_name,
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.withColumn(
feature_name,
(f.col(local_feature_name) - f.col("regional_metric"))
/ f.log10(f.lit(genomic_window + 1)),
f.when(f.col(feature_name) < 0, f.lit(0.0))
.when(f.col(feature_name) > 1, f.lit(1.0))
.otherwise(f.col(feature_name)),
)
.drop("regional_metric", local_feature_name)
.drop("regional_max", local_feature_name)
)


@@ -168,7 +185,7 @@ def compute(
class DistanceTssMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to TSS for all genes in the vicinity of a studyLocus."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceTssMeanNeighbourhood"

@classmethod
@@ -244,7 +261,7 @@ def compute(
class DistanceSentinelTssNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene TSS as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceSentinelTssNeighbourhood"

@classmethod
@@ -325,7 +342,7 @@ def compute(
class DistanceFootprintMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to footprint for all genes in the vicinity of a studyLocus."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceFootprintMeanNeighbourhood"

@classmethod
@@ -401,7 +418,7 @@ def compute(
class DistanceSentinelFootprintNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene footprint as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

feature_dependency_type = VariantIndex
feature_dependency_type = [VariantIndex, GeneIndex]
feature_name = "distanceSentinelFootprintNeighbourhood"

@classmethod
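For the distance features the same ratio is computed, and a second withColumn then clips the result to the [0, 1] interval, replacing the previous log10-scaled difference from the regional mean. A toy sketch of that clipping step alone, with an assumed column name rather than the gentropy helpers:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Clip the neighbourhood ratio to [0, 1]: negatives become 0.0, anything above 1 becomes 1.0.
scores = spark.createDataFrame(
    [(-0.2,), (0.4,), (1.3,)],
    ["distanceTssMeanNeighbourhood"],
)
clipped = scores.withColumn(
    "distanceTssMeanNeighbourhood",
    f.when(f.col("distanceTssMeanNeighbourhood") < 0, f.lit(0.0))
    .when(f.col("distanceTssMeanNeighbourhood") > 1, f.lit(1.0))
    .otherwise(f.col("distanceTssMeanNeighbourhood")),
)
clipped.show()  # -0.2 -> 0.0, 0.4 -> 0.4, 1.3 -> 1.0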
36 changes: 20 additions & 16 deletions src/gentropy/dataset/l2g_features/vep.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.gene_index import GeneIndex
@@ -79,7 +80,7 @@ def common_neighbourhood_vep_feature_logic(
gene_index: GeneIndex,
feature_name: str,
) -> DataFrame:
"""Extracts variant severity score computed from VEP for any gene, based on what is the mean score for protein coding genes that are nearby the locus.
"""Extracts variant severity score computed from VEP for any gene, based on what is the max score for protein coding genes that are nearby the locus.
Args:
study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
@@ -95,26 +96,29 @@
study_loci_to_annotate,
feature_name=local_feature_name,
variant_index=variant_index,
).join(
# Bring gene classification
gene_index.df.select("geneId", "biotype"),
"geneId",
"inner",
)
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non protein coding genes in the vicinity are excluded see #3552)
regional_mean_per_study_locus = (
local_metric.filter(f.col("biotype") == "protein_coding")
.groupBy("studyLocusId")
.agg(f.mean(local_feature_name).alias("regional_mean"))
)
return (
local_metric.join(regional_mean_per_study_locus, "studyLocusId", "left")
local_metric
# Compute the maximum score in the vicinity (feature will be the same for any gene associated with a studyLocus)
# (non-protein-coding genes in the vicinity are excluded, see #3552)
.join(
gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
"geneId",
"inner",
)
.withColumn(
"regional_max",
f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
)
.withColumn(
feature_name,
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
f.when(
(f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
f.col(local_feature_name)
/ f.coalesce(f.col("regional_max"), f.lit(0.0)),
).otherwise(f.lit(0.0)),
)
.drop("regional_mean", local_feature_name, "biotype")
.drop("regional_max", local_feature_name)
)


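For intuition on the switch from a mean-difference to a max-ratio definition: with toy scores of 1.0, 0.5 and 0.0 for the protein-coding genes in a locus, the old definition gives the top gene 1.0 - 0.5 = 0.5, while the new one gives it 1.0 / 1.0 = 1.0, i.e. "as good as the best gene in the region". A plain-Python comparison with those toy numbers (illustrative only, not repository code):

# Old definition: local score minus the regional mean.
# New definition: local score divided by the regional max (similarity with the best gene).
scores = {"geneA": 1.0, "geneB": 0.5, "geneC": 0.0}  # protein-coding genes in one locus

regional_mean = sum(scores.values()) / len(scores)
regional_max = max(scores.values())

old_feature = {gene: score - regional_mean for gene, score in scores.items()}
new_feature = {
    gene: (score / regional_max if regional_max != 0.0 else 0.0)
    for gene, score in scores.items()
}

print(old_feature)  # {'geneA': 0.5, 'geneB': 0.0, 'geneC': -0.5}
print(new_feature)  # {'geneA': 1.0, 'geneB': 0.5, 'geneC': 0.0}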
1 change: 1 addition & 0 deletions src/gentropy/dataset/l2g_gold_standard.py
@@ -132,6 +132,7 @@ def build_feature_matrix(
on=["studyId", "variantId", "geneId"],
how="inner",
)
.filter(f.col("isProteinCoding") == 1)
.drop("studyId", "variantId")
.distinct(),
with_gold_standard=True,
1 change: 1 addition & 0 deletions src/gentropy/dataset/l2g_prediction.py
@@ -78,6 +78,7 @@ def from_credible_set(
credible_set.df.filter(f.col("studyType") == "gwas")
.select("studyLocusId")
.join(feature_matrix._df, "studyLocusId")
.filter(f.col("isProteinCoding") == 1)
)
)
.fill_na()
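The previous two files apply the same restriction at both ends of the pipeline: the gold-standard feature matrix used for training and the feature matrix scored at prediction time keep only rows flagged as protein coding. A minimal sketch of that filter on a toy feature matrix; only the isProteinCoding comparison is taken from the diff, the surrounding DataFrame is illustrative:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A toy feature matrix; rows for non-protein-coding genes are dropped before training/prediction.
feature_matrix = spark.createDataFrame(
    [
        ("sl1", "geneA", 1, 0.9),
        ("sl1", "geneC", 0, 0.7),
    ],
    ["studyLocusId", "geneId", "isProteinCoding", "distanceTssMeanNeighbourhood"],
)
coding_only = feature_matrix.filter(f.col("isProteinCoding") == 1)
coding_only.show()  # only the geneA row remains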
2 changes: 1 addition & 1 deletion src/gentropy/l2g.py
@@ -163,7 +163,7 @@ def __init__(
session, credible_set_path, recursiveFileLookup=True
)
self.feature_matrix = L2GFeatureMatrix(
_df=session.load_data(feature_matrix_path), features_list=self.features_list
_df=session.load_data(feature_matrix_path),
)

if run_mode == "predict":