forked from opentargets/gentropy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: ingest FinnGen UKB meta-analysis data (opentargets#756)
* feat: implement FinnGen UKB meta-analysis ingestion and harmonisation * chore: remove ot_finngen_ukb_meta.yaml * chore: rename raw_study_index_path to raw_study_index_path_from_tsv * fix: use session.write_mode * style: rename class to FinngenUkbMetaIngestionStep
- Loading branch information
Showing
12 changed files
with
236 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""FinnGen UKB meta-analysis data source.""" | ||
|
||
from __future__ import annotations |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
"""Study Index for Finngen data source.""" | ||
from __future__ import annotations | ||
|
||
import pyspark.sql.functions as f | ||
from pyspark.sql import SparkSession | ||
|
||
from gentropy.dataset.study_index import StudyIndex | ||
|
||
|
||
class FinngenUkbMetaStudyIndex(StudyIndex):
    """Study index dataset from FinnGen UKB meta-analysis."""

    @classmethod
    def from_source(
        cls: type[FinngenUkbMetaStudyIndex],
        spark: SparkSession,
        raw_study_index_path_from_tsv: str,
    ) -> StudyIndex:
        """Ingest study level metadata from FinnGen UKB meta-analysis.

        The raw table is a tab-separated file with a header row. It must provide
        the columns `_gentropy_study_id`, `name`, `_gentropy_summary_stats_link`,
        `fg_n_cases`, `ukbb_n_cases`, `fg_n_controls` and `ukbb_n_controls`.

        Args:
            spark (SparkSession): Spark session object.
            raw_study_index_path_from_tsv (str): Raw study index path.

        Returns:
            StudyIndex: Parsed and annotated FinnGen UKB meta-analysis study table.
        """
        # The file is read without a schema and without schema inference, so the
        # count columns are not typed as integers on input. Cast the total
        # explicitly so that nSamples carries the same integer value that is
        # reported as the discovery sample size below.
        total_samples = (
            f.col("fg_n_cases")
            + f.col("ukbb_n_cases")
            + f.col("fg_n_controls")
            + f.col("ukbb_n_controls")
        ).cast("integer")

        # Read the raw study index and process.
        study_index_df = spark.read.csv(
            raw_study_index_path_from_tsv, sep="\t", header=True
        ).select(
            f.lit("gwas").alias("studyType"),
            f.lit("FINNGEN_R11_UKB_META").alias("projectId"),
            f.col("_gentropy_study_id").alias("studyId"),
            f.col("name").alias("traitFromSource"),
            f.lit(True).alias("hasSumstats"),
            f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
            total_samples.alias("nSamples"),
        )

        # Add population structure: a single discovery sample entry labelled
        # "European" that carries the full sample size.
        study_index_df = study_index_df.withColumn(
            "discoverySamples",
            f.array(
                f.struct(
                    f.col("nSamples").alias("sampleSize"),
                    f.lit("European").alias("ancestry"),
                )
            ),
        ).withColumn(
            "ldPopulationStructure",
            cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
        )

        return StudyIndex(
            _df=study_index_df,
            _schema=StudyIndex.get_schema(),
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
"""Summary statistics ingestion for FinnGen UKB meta-analysis.""" | ||
|
||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
|
||
from pyspark.sql import SparkSession | ||
|
||
from gentropy.common.harmonise import harmonise_summary_stats | ||
from gentropy.dataset.summary_statistics import SummaryStatistics | ||
|
||
|
||
@dataclass
class FinngenUkbMetaSummaryStats:
    """Summary statistics dataset for FinnGen UKB meta-analysis."""

    @classmethod
    def from_source(
        cls: type[FinngenUkbMetaSummaryStats],
        spark: SparkSession,
        raw_summary_stats_path: str,
        tmp_variant_annotation_path: str,
        chromosome: str,
        study_index_path: str,
    ) -> SummaryStatistics:
        """Ingest and harmonise all summary stats for FinnGen UKB meta-analysis data.

        Args:
            spark (SparkSession): Spark session object.
            raw_summary_stats_path (str): Input raw summary stats path.
            tmp_variant_annotation_path (str): Input variant annotation dataset path.
            chromosome (str): Which chromosome to process.
            study_index_path (str): Path to the study index in Parquet format; its
                `studyId` and `nSamples` columns are joined onto the harmonised data.

        Returns:
            SummaryStatistics: Processed summary statistics dataset for a given chromosome.
        """
        # Names of the raw input columns handed to the shared harmonisation
        # routine; entries set to None are passed through as None.
        source_columns = {
            "colname_position": "POS",
            "colname_allele0": "REF",
            "colname_allele1": "ALT",
            "colname_a1freq": None,
            "colname_info": None,
            "colname_beta": "all_inv_var_meta_beta",
            "colname_se": "all_inv_var_meta_sebeta",
            "colname_mlog10p": "all_inv_var_meta_mlogp",
            "colname_n": None,
        }
        harmonised = harmonise_summary_stats(
            spark,
            raw_summary_stats_path,
            tmp_variant_annotation_path,
            chromosome,
            **source_columns,
        )

        # Attach the per-study sample size from the study index. The join is an
        # inner join, so rows whose studyId is absent from the index are dropped.
        sample_sizes = spark.read.parquet(study_index_path).select(
            "studyId", "nSamples"
        )
        annotated = harmonised.join(sample_sizes, on=["studyId"], how="inner")

        return SummaryStatistics(
            _df=annotated,
            _schema=SummaryStatistics.get_schema(),
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Step to run FinnGen UKB meta-analysis data ingestion.""" | ||
|
||
from __future__ import annotations | ||
|
||
from gentropy.common.per_chromosome import ( | ||
prepare_va, | ||
process_summary_stats_per_chromosome, | ||
) | ||
from gentropy.common.session import Session | ||
from gentropy.datasource.finngen_ukb_meta.study_index import FinngenUkbMetaStudyIndex | ||
from gentropy.datasource.finngen_ukb_meta.summary_stats import ( | ||
FinngenUkbMetaSummaryStats, | ||
) | ||
|
||
|
||
class FinngenUkbMetaIngestionStep:
    """FinnGen UKB meta-analysis data ingestion and harmonisation."""

    def __init__(
        self,
        session: Session,
        raw_study_index_path_from_tsv: str,
        raw_summary_stats_path: str,
        variant_annotation_path: str,
        tmp_variant_annotation_path: str,
        study_index_output_path: str,
        summary_stats_output_path: str,
    ) -> None:
        """Data ingestion and harmonisation step for FinnGen UKB meta-analysis.

        The step runs three stages in order: it prepares the variant annotation
        dataset, writes the study index, and then processes the summary
        statistics, passing the freshly written study index path along.

        Args:
            session (Session): Session object.
            raw_study_index_path_from_tsv (str): Input raw study index path.
            raw_summary_stats_path (str): Input raw summary stats path.
            variant_annotation_path (str): Input variant annotation dataset path.
            tmp_variant_annotation_path (str): Temporary output path for variant annotation dataset.
            study_index_output_path (str): Study index output path.
            summary_stats_output_path (str): Summary stats output path.
        """
        session.logger.info(
            "Pre-compute the direct and flipped variant annotation dataset."
        )
        prepare_va(session, variant_annotation_path, tmp_variant_annotation_path)

        session.logger.info("Process study index.")
        study_index = FinngenUkbMetaStudyIndex.from_source(
            spark=session.spark,
            raw_study_index_path_from_tsv=raw_study_index_path_from_tsv,
        )
        # Write the study index before the summary stats stage, which is given
        # study_index_output_path as an input.
        study_index.df.write.mode(session.write_mode).parquet(study_index_output_path)

        session.logger.info("Process and harmonise summary stats.")
        process_summary_stats_per_chromosome(
            session,
            FinngenUkbMetaSummaryStats,
            raw_summary_stats_path,
            tmp_variant_annotation_path,
            summary_stats_output_path,
            study_index_output_path,
        )
Oops, something went wrong.