From d4b507049ca54c927e8d80da43449f93e931e28a Mon Sep 17 00:00:00 2001
From: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:41:19 +0200
Subject: [PATCH] fix: align the schema of study_index for ukb ppp eur (#803)

* fix(ukb_ppp_study_index): update column name to match schema

* chore: add note that notebooks are not supported

---------

Co-authored-by: Szymon Szyszkowski
---
 notebooks/README.md                        |  3 ++
 notebooks/Release_QC_metrics.ipynb         |  2 +-
 .../datasource/ukb_ppp_eur/study_index.py  | 31 ++++++++-----------
 3 files changed, 17 insertions(+), 19 deletions(-)
 create mode 100644 notebooks/README.md

diff --git a/notebooks/README.md b/notebooks/README.md
new file mode 100644
index 000000000..35132b6bf
--- /dev/null
+++ b/notebooks/README.md
@@ -0,0 +1,3 @@
+# Notebooks
+
+The notebooks listed in this directory are not actively maintained or updated.

diff --git a/notebooks/Release_QC_metrics.ipynb b/notebooks/Release_QC_metrics.ipynb
index 4eb27015b..5f9bf77c0 100644
--- a/notebooks/Release_QC_metrics.ipynb
+++ b/notebooks/Release_QC_metrics.ipynb
@@ -419,7 +419,7 @@
     "# Number of studies\n",
     "eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n",
     "# Number of tissues, list of tissues\n",
-    "#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n",
+    "#eqtl_index.select(f.col(\"biosampleFromSourceId\")).distinct().show(truncate=False)\n",
     "\n",
     "# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n",
     "# eqtl catalog susie:\n",

diff --git a/src/gentropy/datasource/ukb_ppp_eur/study_index.py b/src/gentropy/datasource/ukb_ppp_eur/study_index.py
index f694b9a47..8a3105f5d 100644
--- a/src/gentropy/datasource/ukb_ppp_eur/study_index.py
+++ b/src/gentropy/datasource/ukb_ppp_eur/study_index.py
@@ -1,4 +1,5 @@
 """Study Index for Finngen data source."""
+
 from __future__ import annotations

 import pyspark.sql.functions as f
@@ -29,9 +30,7 @@ def from_source(
         """
         # In order to populate the nSamples column, we need to peek inside the summary stats dataframe.
         num_of_samples = (
-            spark
-            .read
-            .parquet(raw_summary_stats_path)
+            spark.read.parquet(raw_summary_stats_path)
             .filter(f.col("chromosome") == "22")
             .groupBy("studyId")
             .agg(f.first("N").cast("integer").alias("nSamples"))
@@ -45,7 +44,7 @@
                 f.lit("UKB_PPP_EUR").alias("projectId"),
                 f.col("_gentropy_study_id").alias("studyId"),
                 f.col("UKBPPP_ProteinID").alias("traitFromSource"),
-                f.lit("UBERON_0001969").alias("tissueFromSourceId"),
+                f.lit("UBERON_0001969").alias("biosampleFromSourceId"),
                 f.col("ensembl_id").alias("geneId"),
                 f.lit(True).alias("hasSumstats"),
                 f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
             )
             .join(num_of_samples, "studyId", "inner")
         )
         # Add population structure.
-        study_index_df = (
-            study_index_df
-            .withColumn(
-                "discoverySamples",
-                f.array(
-                    f.struct(
-                        f.col("nSamples").cast("integer").alias("sampleSize"),
-                        f.lit("European").alias("ancestry"),
-                    )
+        study_index_df = study_index_df.withColumn(
+            "discoverySamples",
+            f.array(
+                f.struct(
+                    f.col("nSamples").cast("integer").alias("sampleSize"),
+                    f.lit("European").alias("ancestry"),
                 )
-            )
-            .withColumn(
-                "ldPopulationStructure",
-                cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
-            )
+            ),
+        ).withColumn(
+            "ldPopulationStructure",
+            cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
         )

         return StudyIndex(