Integrate eCAVIAR colocalization (opentargets#58)
* refactor: extract method to find overlapping peaks to `find_gwas_vs_all_overlapping_peaks`

* test: add tests for `find_gwas_vs_all_overlapping_peaks`

* fix: add gene_id to the metadata col

* feat: implement working ecaviar

* test: add test for `ecaviar_colocalisation`

* test: added test for _extract_credible_sets

* fix: filter only variants in the 95 credible set

* docs: update coloc docs

* feat: add coloc schema and validation step
ireneisdoomed authored Jan 3, 2023
1 parent 2855069 commit 8c5c532
Showing 9 changed files with 779 additions and 65 deletions.
4 changes: 3 additions & 1 deletion configs/etl/reference.yaml
@@ -9,14 +9,16 @@ coloc:
# TODO: to update
sumstats_filtered: gs://genetics-portal-dev-sumstats/filtered/significant_window_2mb_union
outputs:
coloc: ${etl.outputs}/coloc
coloc: ${etl.outputs}/coloc_ecaviar_all
parameters:
# priorc1 Prior on variant being causal for trait 1
priorc1: 1e-4
# priorc2 Prior on variant being causal for trait 2
priorc2: 1e-4
# priorc12 Prior on variant being causal for traits 1 and 2
priorc12: 1e-5
# pp_threshold Posterior probability threshold to consider a PICS signal
pp_threshold: 0.001
machine: ${machine.coloc}

v2g:
30 changes: 29 additions & 1 deletion docs/modules/colocalisation.md
@@ -1,3 +1,31 @@
::: etl.coloc
This workflow runs colocalisation analyses that assess the degree to which independent association signals share the same causal variant in a genomic region, typically delimited by linkage disequilibrium (LD).

The colocalisation test is performed using two methods:

1. Based on the [R COLOC package](https://github.com/chr1swallace/coloc/blob/main/R/claudia.R), which uses the Bayes factors from the credible set to estimate the posterior probability of colocalisation. This method makes the simplifying assumption that **only a single causal variant** exists for any given trait in any genomic region.

Using GWAS summary statistics, and without information about LD, we start by enumerating all variant-level hypotheses:

Hypothesis | Description
--- | ---
H_0 | no association with either trait in the region
H_1 | association with trait 1 only
H_2 | association with trait 2 only
H_3 | both traits are associated, but have different single causal variants
H_4 | both traits are associated and share the same single causal variant
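The posterior for each hypothesis can be sketched from per-variant log Bayes factors (log-ABFs). The following is an illustrative NumPy sketch of the standard COLOC computation, not the ETL code itself; the function and argument names (`coloc_posteriors`, `l1`, `l2`) are hypothetical, and the default priors match the config above:

```python
import numpy as np

def _logsumexp(x):
    """Numerically stable log(sum(exp(x)))."""
    m = np.max(x)
    return m + np.log(np.sum(np.exp(x - m)))

def coloc_posteriors(l1, l2, p1=1e-4, p2=1e-4, p12=1e-5):
    """Posterior probabilities of H0..H4 from per-variant log-ABFs.

    l1, l2: arrays of log Bayes factors for traits 1 and 2 over the
    same set of variants in the region.
    """
    lsum1 = _logsumexp(l1)        # evidence that some variant is causal for trait 1
    lsum2 = _logsumexp(l2)        # same for trait 2
    lsum12 = _logsumexp(l1 + l2)  # evidence that one variant is causal for both
    lh0 = 0.0
    lh1 = np.log(p1) + lsum1
    lh2 = np.log(p2) + lsum2
    # H3: all cross-variant pairs minus the shared-variant diagonal, computed stably
    a, b = lsum1 + lsum2, lsum12
    lh3 = np.log(p1) + np.log(p2) + a + np.log1p(-np.exp(b - a))
    lh4 = np.log(p12) + lsum12
    lh = np.array([lh0, lh1, lh2, lh3, lh4])
    # Normalise the weighted hypothesis likelihoods into PP(H0)..PP(H4)
    return np.exp(lh - _logsumexp(lh))
```

With a strongly shared signal (e.g. one variant carrying a large log-ABF in both traits), PP(H4) dominates.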



2. Based on eCAVIAR. It extends the [CAVIAR](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5142122/#bib18) framework to explicitly estimate the posterior probability that the same variant is causal in two studies while accounting for the uncertainty of LD.

eCAVIAR computes the colocalization posterior probability (**CLPP**) by utilizing the marginal posterior probabilities derived from PICS. This framework allows for **multiple variants to be causal** in a single locus.
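Under that independence assumption, the per-variant CLPP is simply the product of the two posterior probabilities, and the signal-level score sums over the variants that pass the threshold. A minimal illustrative sketch in plain Python (not the Spark implementation; the dict-based interface and the name `clpp_score` are hypothetical):

```python
def clpp_score(left_pp, right_pp, clpp_threshold=0.001):
    """Colocalisation posterior probability for a pair of overlapping signals.

    left_pp / right_pp: dicts mapping tagVariantId -> posterior probability
    that the variant is causal, taken from each study's credible set.
    """
    shared = set(left_pp) & set(right_pp)
    # Per-variant CLPP: product of the two posteriors (independence assumption)
    per_variant = {v: left_pp[v] * right_pp[v] for v in shared}
    # Keep variants above the colocalisation cutoff
    kept = {v: p for v, p in per_variant.items() if p > clpp_threshold}
    # Signal-level score: sum of retained per-variant CLPPs, plus the count
    return sum(kept.values()), len(kept)
```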

## Summary of the logic

The workflow is divided into 2 steps for both methods:

**1. Find all vs all pairs of independent signals of association in the region of interest.**
::: etl.coloc.overlaps
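The pairing logic of this step can be sketched in plain Python as an illustrative model of the Spark self-join (the dict-shaped inputs are hypothetical): a GWAS signal on the left is paired with any signal sharing a tag variant on the same chromosome, and GWAS–GWAS pairs are deduplicated by ordering study IDs.

```python
def overlapping_pairs(signals):
    """Pair GWAS signals (left) with any signal (right) sharing a tag variant.

    signals: list of dicts with keys studyId, type, chromosome, tagVariantId.
    """
    pairs = set()
    for left in signals:
        if left["type"] != "gwas":
            continue  # only GWAS signals may appear on the left
        for right in signals:
            same_tag = (
                left["chromosome"] == right["chromosome"]
                and left["tagVariantId"] == right["tagVariantId"]
            )
            # GWAS vs GWAS pairs are kept once by ordering study IDs;
            # GWAS vs molecular-trait pairs are always kept
            keep = right["type"] != "gwas" or left["studyId"] > right["studyId"]
            if same_tag and keep:
                pairs.add((left["studyId"], right["studyId"]))
    return pairs
```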

**2. For each pair of signals, run the colocalisation test.**
::: etl.coloc.coloc
120 changes: 106 additions & 14 deletions src/etl/coloc/coloc.py
@@ -11,9 +11,12 @@
from pyspark.sql.types import DoubleType

if TYPE_CHECKING:
from pyspark.sql import DataFrame
from pyspark.sql import Column, DataFrame

from etl.coloc.overlaps import find_all_vs_all_overlapping_signals
from etl.coloc.overlaps import (
find_all_vs_all_overlapping_signals,
find_gwas_vs_all_overlapping_peaks,
)


def run_colocalisation(
@@ -22,20 +25,51 @@ def run_colocalisation(
priorc1: float,
priorc2: float,
prioc12: float,
pp_threshold: float,
sumstats: DataFrame,
) -> DataFrame:
"""Run colocalisation analysis."""
# 1. Looking for overlapping signals
overlapping_signals = find_all_vs_all_overlapping_signals(credible_sets, study_df)
credible_sets_enriched = credible_sets.join(
f.broadcast(study_df), on="studyId", how="left"
).persist()
# 1. Standard colocalisation
# Looking for overlapping signals
overlapping_signals = find_all_vs_all_overlapping_signals(
# We keep only the credible sets where the probability is given as a Bayes factor
credible_sets_enriched.filter(f.col("logABF").isNotNull())
)
fm_coloc = colocalisation(overlapping_signals, priorc1, priorc2, prioc12)

return (
# 2. Perform colocalisation analysis
colocalisation(
overlapping_signals,
priorc1,
priorc2,
prioc12,
# 2. LD-expanded colocalisation
# Looking for overlapping signals
overlapping_signals = find_gwas_vs_all_overlapping_peaks(
# We keep the credible sets where the probability is given as a posterior probability (from PICS, or from SuSIE for FinnGen)
credible_sets_enriched.filter(f.col("posteriorProbability") > pp_threshold),
"posteriorProbability",
)

# IDEA: pass a parameter with the type of coloc: all vs all, gwas vs all
metadata_cols = ["studyId", "phenotype", "biofeature"]
ecaviar_coloc = (
ecaviar_colocalisation(overlapping_signals, pp_threshold)
# Add study metadata - to be deprecated
# the resulting table has more rows because of the studies that have multiple mapped traits
.join(
study_df.selectExpr(*(f"{i} as left_{i}" for i in metadata_cols)),
on="left_studyId",
how="left",
)
.join(
study_df.selectExpr(*(f"{i} as right_{i}" for i in metadata_cols)),
on="right_studyId",
how="left",
)
.distinct()
)

return (
# 3. Join colocalisation results
fm_coloc.unionByName(ecaviar_coloc, allowMissingColumns=True)
# 4. Add betas from sumstats
# Adds backwards compatibility with production schema
# Note: First implementation in _add_coloc_sumstats_info hasn't been fully tested
Expand Down Expand Up @@ -79,6 +113,22 @@ def _get_posteriors(all_abfs: VectorUDT) -> DenseVector:
return Vectors.dense(abfs_posteriors)


def _get_clpp(left_pp: Column, right_pp: Column) -> Column:
"""Calculate the colocalisation posterior probability (CLPP).
If the fact that the same variant is found causal for two studies are independent events,
CLPP is defined as the product of posterior porbabilities that a variant is causal in both studies.
Args:
left_pp (Column): left posterior probability
right_pp (Column): right posterior probability
Returns:
Column: CLPP
"""
return left_pp * right_pp


def colocalisation(
overlapping_signals: DataFrame, priorc1: float, priorc2: float, priorc12: float
) -> DataFrame:
@@ -105,7 +155,7 @@ def colocalisation(
posteriors = f.udf(_get_posteriors, VectorUDT())
coloc = (
overlapping_signals
# Before summarizing log_abf columns nulls need to be filled with 0:
# Before summing log_abf columns nulls need to be filled with 0:
.fillna(0, subset=["left_logABF", "right_logABF"])
# Sum of log_abfs for each pair of signals
.withColumn("sum_log_abf", f.col("left_logABF") + f.col("right_logABF"))
@@ -126,10 +176,12 @@
"sum_log_abf"
),
# carrying over information and renaming columns (backwards compatible)
f.first("left_traitFromSourceMappedId").alias("left_phenotype"),
f.first("left_phenotype").alias("left_phenotype"),
f.first("left_biofeature").alias("left_biofeature"),
f.first("right_traitFromSourceMappedId").alias("right_phenotype"),
f.first("left_gene_id").alias("left_gene_id"),
f.first("right_phenotype").alias("right_phenotype"),
f.first("right_biofeature").alias("right_biofeature"),
f.first("right_gene_id").alias("right_gene_id"),
)
.withColumn("logsum1", logsum(f.col("left_logABF")))
.withColumn("logsum2", logsum(f.col("right_logABF")))
@@ -195,3 +247,43 @@
.drop("posteriors", "allABF", "lH0abf", "lH1abf", "lH2abf", "lH3abf", "lH4abf")
)
return coloc


def ecaviar_colocalisation(
overlapping_signals: DataFrame, clpp_threshold: float
) -> DataFrame:
"""Calculate bayesian colocalisation based on overlapping signals.
Args:
overlapping_signals (DataFrame): DataFrame with overlapping signals.
clpp_threshold (float): Colocalization cutoff threshold as described in the paper.
Returns:
DataFrame: DataFrame with colocalisation results.
"""
signal_pairs_cols = [
"chromosome",
"studyId",
"leadVariantId",
"type",
]

return (
overlapping_signals.withColumn(
"clpp",
_get_clpp(
f.col("left_posteriorProbability"), f.col("right_posteriorProbability")
),
)
.filter(f.col("clpp") > clpp_threshold)
.groupBy(
*(
[f"left_{col}" for col in signal_pairs_cols]
+ [f"right_{col}" for col in signal_pairs_cols]
)
)
.agg(
f.count("*").alias("coloc_n_vars"),
f.sum(f.col("clpp")).alias("clpp"),
)
)
100 changes: 63 additions & 37 deletions src/etl/coloc/overlaps.py
@@ -11,8 +11,7 @@


def find_all_vs_all_overlapping_signals(
credible_sets: DataFrame,
study_df: DataFrame,
credible_sets_enriched: DataFrame,
) -> DataFrame:
"""Find overlapping signals.
@@ -23,57 +22,32 @@ def find_all_vs_all_overlapping_signals(
appearing on the right side.
Args:
credible_sets (DataFrame): DataFrame containing the credible sets to be analysed
study_df (DataFrame): DataFrame containing metadata about the study
credible_sets_enriched (DataFrame): DataFrame containing the credible sets to be analysed
Returns:
DataFrame: overlapping peaks to be compared
"""
credible_sets_enriched = credible_sets.join(
f.broadcast(study_df), on="studyId", how="left"
)
# Columns to be used as left and right
id_cols = [
"chromosome",
"studyId",
"leadVariantId",
"type",
]
metadata_cols = [
"traitFromSourceMappedId",
"biofeature",
]
metadata_cols = ["phenotype", "biofeature", "gene_id"]

# Self-join with a complex condition. The left side is always GWAS; the right side can be GWAS or a molecular trait
cols_to_rename = id_cols
credset_to_self_join = credible_sets_enriched.select(id_cols + ["tagVariantId"])
# This function includes some metadata about the overlap that needs to be dropped to avoid duplicates
cols_to_drop = ["left_logABF", "right_logABF", "tagVariantId"]
overlapping_peaks = (
credset_to_self_join.alias("left")
.filter(f.col("type") == "gwas")
.join(
credset_to_self_join.alias("right"),
on=[
f.col("left.chromosome") == f.col("right.chromosome"),
f.col("left.tagVariantId") == f.col("right.tagVariantId"),
(f.col("right.type") != "gwas")
| (f.col("left.studyId") > f.col("right.studyId")),
],
how="inner",
)
.drop("left.tagVariantId", "right.tagVariantId")
# Rename columns to make them unambiguous
.selectExpr(
*(
[f"left.{col} as left_{col}" for col in cols_to_rename]
+ [f"right.{col} as right_{col}" for col in cols_to_rename]
)
)
.dropDuplicates(
[f"left_{i}" for i in id_cols] + [f"right_{i}" for i in id_cols]
)
.cache()
find_gwas_vs_all_overlapping_peaks(credible_sets_enriched, "logABF")
# Keep overlapping peaks where logABF is at either side
.filter(
f.col("left_logABF").isNotNull() & f.col("right_logABF").isNotNull()
).drop(*cols_to_drop)
)

# Bring metadata from left and right for all variants that tag the peak
overlapping_left = credible_sets_enriched.selectExpr(
[f"{col} as left_{col}" for col in id_cols + metadata_cols + ["logABF"]]
+ ["tagVariantId"]
@@ -99,3 +73,55 @@
+ ["tagVariantId"],
how="outer",
)


def find_gwas_vs_all_overlapping_peaks(
credible_sets_enriched: DataFrame, causality_statistic: str
) -> DataFrame:
"""Find overlapping signals between GWAS (left) and GWAS or Molecular traits (right).
The main principle is that here we extract which are the peaks that share a tagging variant for the same trait.
Args:
credible_sets_enriched (DataFrame): DataFrame containing the credible sets to be analysed
causality_statistic (str): Causality statistic to be used for the analysis. Can be either logABF or posteriorProbability
Returns:
DataFrame: overlapping peaks to be compared
"""
id_cols = [
"chromosome",
"studyId",
"leadVariantId",
"type",
]
cols_to_rename = id_cols + [causality_statistic]
credset_to_self_join = credible_sets_enriched.select(
id_cols + ["tagVariantId", causality_statistic]
)
return (
credset_to_self_join.alias("left")
.filter(f.col("type") == "gwas")
.join(
credset_to_self_join.alias("right"),
on=[
f.col("left.chromosome") == f.col("right.chromosome"),
f.col("left.tagVariantId") == f.col("right.tagVariantId"),
(f.col("right.type") != "gwas")
| (f.col("left.studyId") > f.col("right.studyId")),
],
how="inner",
)
# Rename columns to make them unambiguous
.selectExpr(
*(
[f"left.{col} as left_{col}" for col in cols_to_rename]
+ [f"right.{col} as right_{col}" for col in cols_to_rename]
+ ["left.tagVariantId as tagVariantId"],
)
)
.dropDuplicates(
[f"left_{i}" for i in id_cols] + [f"right_{i}" for i in id_cols]
)
.cache()
)
5 changes: 3 additions & 2 deletions src/etl/coloc/utils.py
@@ -75,14 +75,15 @@ def _extract_credible_sets(
"""
return (
study_locus.withColumn("credibleSet", f.explode("credibleSets"))
# Filter out credible sets generated by PICS (for now)
.filter(f.col("credibleSet.method").isin(["conditional", "SuSIE"]))
.withColumn("credibleVariant", f.explode("credibleSet.credibleSet"))
.filter(f.col("credibleVariant.is95CredibleSet"))
.select(
f.col("variantId").alias("leadVariantId"),
"studyId",
"credibleVariant.tagVariantId",
"credibleVariant.logABF",
"credibleVariant.posteriorProbability",
f.split(f.col("variantId"), "_")[0].alias("chromosome"),
"credibleSet.method",
)
)