From d4b507049ca54c927e8d80da43449f93e931e28a Mon Sep 17 00:00:00 2001
From: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:41:19 +0200
Subject: [PATCH] fix: align the schema of study_index for ukb ppp eur (#803)

* fix(ukb_ppp_study_index): update column name to match schema

* chore: add note that notebooks are not supported

---------

Co-authored-by: Szymon Szyszkowski
---
 notebooks/README.md                        |  3 ++
 notebooks/Release_QC_metrics.ipynb         |  2 +-
 .../datasource/ukb_ppp_eur/study_index.py  | 31 ++++++++-----------
 3 files changed, 17 insertions(+), 19 deletions(-)
 create mode 100644 notebooks/README.md

diff --git a/notebooks/README.md b/notebooks/README.md
new file mode 100644
index 000000000..35132b6bf
--- /dev/null
+++ b/notebooks/README.md
@@ -0,0 +1,3 @@
+# Notebooks
+
+The notebooks listed in this directory are not actively maintained or updated.

diff --git a/notebooks/Release_QC_metrics.ipynb b/notebooks/Release_QC_metrics.ipynb
index 4eb27015b..5f9bf77c0 100644
--- a/notebooks/Release_QC_metrics.ipynb
+++ b/notebooks/Release_QC_metrics.ipynb
@@ -419,7 +419,7 @@
     "# Number of studies\n",
     "eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n",
     "# Number of tissues, list of tissues\n",
-    "#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n",
+    "#eqtl_index.select(f.col(\"biosampleFromSourceId\")).distinct().show(truncate=False)\n",
     "\n",
     "# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n",
     "# eqtl catalog susie:\n",

diff --git a/src/gentropy/datasource/ukb_ppp_eur/study_index.py b/src/gentropy/datasource/ukb_ppp_eur/study_index.py
index f694b9a47..8a3105f5d 100644
--- a/src/gentropy/datasource/ukb_ppp_eur/study_index.py
+++ b/src/gentropy/datasource/ukb_ppp_eur/study_index.py
@@ -1,4 +1,5 @@
 """Study Index for Finngen data source."""
+
 from __future__ import annotations

 import pyspark.sql.functions as f
@@ -29,9 +30,7 @@ def from_source(
         """
         # In order to populate the nSamples column, we need to peek inside the summary stats dataframe.
         num_of_samples = (
-            spark
-            .read
-            .parquet(raw_summary_stats_path)
+            spark.read.parquet(raw_summary_stats_path)
             .filter(f.col("chromosome") == "22")
             .groupBy("studyId")
             .agg(f.first("N").cast("integer").alias("nSamples"))
@@ -45,7 +44,7 @@
                 f.lit("UKB_PPP_EUR").alias("projectId"),
                 f.col("_gentropy_study_id").alias("studyId"),
                 f.col("UKBPPP_ProteinID").alias("traitFromSource"),
-                f.lit("UBERON_0001969").alias("tissueFromSourceId"),
+                f.lit("UBERON_0001969").alias("biosampleFromSourceId"),
                 f.col("ensembl_id").alias("geneId"),
                 f.lit(True).alias("hasSumstats"),
                 f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
             )
             .join(num_of_samples, "studyId", "inner")
         )
         # Add population structure.
-        study_index_df = (
-            study_index_df
-            .withColumn(
-                "discoverySamples",
-                f.array(
-                    f.struct(
-                        f.col("nSamples").cast("integer").alias("sampleSize"),
-                        f.lit("European").alias("ancestry"),
-                    )
+        study_index_df = study_index_df.withColumn(
+            "discoverySamples",
+            f.array(
+                f.struct(
+                    f.col("nSamples").cast("integer").alias("sampleSize"),
+                    f.lit("European").alias("ancestry"),
                 )
-            )
-            .withColumn(
-                "ldPopulationStructure",
-                cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
-            )
+            ),
+        ).withColumn(
+            "ldPopulationStructure",
+            cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
         )

         return StudyIndex(