fix(find_overlap): missing right study type in output (opentargets#828)

* fix: rightStudyType nulls removed
* test: improve testing of peak overlap function
* fix: overlap test

---------

Co-authored-by: David Ochoa <[email protected]>
thehyve · Oct 11, 2024 · 9f446e8 · 9f446e8
1 parent fb6111d
commit 9f446e8
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 3 deletions.
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -412,6 +412,7 @@ def _overlapping_peaks(
             .select(
                 f.col("left.studyLocusId").alias("leftStudyLocusId"),
                 f.col("right.studyLocusId").alias("rightStudyLocusId"),
+                f.col("right.studyType").alias("rightStudyType"),
                 f.col("left.chromosome").alias("chromosome"),
             )
             .distinct()
@@ -452,7 +453,6 @@ def _align_overlapping_tags(
             f.col("chromosome"),
             f.col("tagVariantId"),
             f.col("studyLocusId").alias("rightStudyLocusId"),
-            f.col("studyType").alias("rightStudyType"),
             *[f.col(col).alias(f"right_{col}") for col in stats_cols],
         ).join(peak_overlaps, on=["chromosome", "rightStudyLocusId"], how="inner")
 
@@ -464,6 +464,7 @@ def _align_overlapping_tags(
                 "rightStudyLocusId",
                 "leftStudyLocusId",
                 "tagVariantId",
+                "rightStudyType",
             ],
             how="outer",
         ).select(

diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
@@ -501,6 +501,15 @@ def sample_ukbiobank_studies(spark: SparkSession) -> DataFrame:
     )
 
 
+@pytest.fixture()
+def study_locus_sample_for_colocalisation(spark: SparkSession) -> DataFrame:
+    """Sample study locus data for colocalisation."""
+    return StudyLocus(
+        _df=spark.read.parquet("tests/gentropy/data_samples/coloc_test.parquet"),
+        _schema=StudyLocus.get_schema(),
+    )
+
+
 @pytest.fixture()
 def sample_target_index(spark: SparkSession) -> DataFrame:
     """Sample target index sample data."""

diff --git a/tests/gentropy/data_samples/coloc_test.parquet b/tests/gentropy/data_samples/coloc_test.parquet
diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py
@@ -4,6 +4,7 @@
 
 from typing import TYPE_CHECKING, Any
 
+import pyspark.sql.functions as f
 import pyspark.sql.types as t
 import pytest
 
@@ -59,7 +60,12 @@ def test_study_locus_overlap_from_associations(mock_study_locus: StudyLocus) ->
             False,
             # expected - output DataFrame with overlapping signals
             [
-                {"leftStudyLocusId": "1", "rightStudyLocusId": "2", "chromosome": "1"},
+                {
+                    "leftStudyLocusId": "1",
+                    "rightStudyLocusId": "2",
+                    "rightStudyType": "eqtl",
+                    "chromosome": "1",
+                },
             ],
         ),
         (
@@ -93,7 +99,14 @@ def test_study_locus_overlap_from_associations(mock_study_locus: StudyLocus) ->
             # intrastudy - bool of whether or not to use inter-study or intra-study logic
             True,
             # expected - output DataFrame with overlapping signals
-            [{"leftStudyLocusId": "2", "rightStudyLocusId": "1", "chromosome": "1"}],
+            [
+                {
+                    "leftStudyLocusId": "2",
+                    "rightStudyLocusId": "1",
+                    "rightStudyType": "gwas",
+                    "chromosome": "1",
+                }
+            ],
         ),
     ],
 )
@@ -118,10 +131,38 @@ def test_overlapping_peaks(
         [
             t.StructField("leftStudyLocusId", t.StringType()),
             t.StructField("rightStudyLocusId", t.StringType()),
+            t.StructField("rightStudyType", t.StringType()),
             t.StructField("chromosome", t.StringType()),
         ]
     )
     observed_df = spark.createDataFrame(observed, mock_schema)
     result_df = StudyLocus._overlapping_peaks(observed_df, intrastudy)
     expected_df = spark.createDataFrame(expected, expected_schema)
     assert result_df.collect() == expected_df.collect()
+
+
+class TestStudyLocusOverlap:
+    """Test the overlapping of StudyLocus dataset."""
+
+    @pytest.fixture(autouse=True)
+    def setup(
+        self: TestStudyLocusOverlap, study_locus_sample_for_colocalisation: StudyLocus
+    ) -> None:
+        """Store the sample dataset and compute its overlaps."""
+        # Store input dataset:
+        self.study_locus = study_locus_sample_for_colocalisation
+
+        # Call locus overlap:
+        self.overlaps = study_locus_sample_for_colocalisation.find_overlaps()
+
+    def test_coloc_return_type(self: TestStudyLocusOverlap) -> None:
+        """Test that find_overlaps returns a StudyLocusOverlap."""
+        assert isinstance(self.overlaps, StudyLocusOverlap)
+
+    def test_coloc_not_null(self: TestStudyLocusOverlap) -> None:
+        """Test that the overlap dataset is not empty."""
+        assert self.overlaps.df.count() != 0
+
+    def test_coloc_study_type_not_null(self: TestStudyLocusOverlap) -> None:
+        """Test that rightStudyType is never null in the overlaps."""
+        assert self.overlaps.filter(f.col("rightStudyType").isNull()).df.count() == 0