Skip to content

Commit

Permalink
fix(find_overlap): missing right study type in output (opentargets#828)
Browse files Browse the repository at this point in the history
* fix: rightStudyType nulls removed

* test: improve testing of peak overlap function

* fix: overlap test

---------

Co-authored-by: David Ochoa <[email protected]>
  • Loading branch information
DSuveges and d0choa authored Oct 11, 2024
1 parent fb6111d commit 9f446e8
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ def _overlapping_peaks(
.select(
f.col("left.studyLocusId").alias("leftStudyLocusId"),
f.col("right.studyLocusId").alias("rightStudyLocusId"),
f.col("right.studyType").alias("rightStudyType"),
f.col("left.chromosome").alias("chromosome"),
)
.distinct()
Expand Down Expand Up @@ -452,7 +453,6 @@ def _align_overlapping_tags(
f.col("chromosome"),
f.col("tagVariantId"),
f.col("studyLocusId").alias("rightStudyLocusId"),
f.col("studyType").alias("rightStudyType"),
*[f.col(col).alias(f"right_{col}") for col in stats_cols],
).join(peak_overlaps, on=["chromosome", "rightStudyLocusId"], how="inner")

Expand All @@ -464,6 +464,7 @@ def _align_overlapping_tags(
"rightStudyLocusId",
"leftStudyLocusId",
"tagVariantId",
"rightStudyType",
],
how="outer",
).select(
Expand Down
9 changes: 9 additions & 0 deletions tests/gentropy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,15 @@ def sample_ukbiobank_studies(spark: SparkSession) -> DataFrame:
)


@pytest.fixture()
def study_locus_sample_for_colocalisation(spark: SparkSession) -> StudyLocus:
    """Sample study locus data for colocalisation.

    Args:
        spark (SparkSession): Spark session used to read the sample parquet file.

    Returns:
        StudyLocus: Study locus dataset wrapping the `coloc_test.parquet` sample.
    """
    # NOTE(review): the parquet path is relative, so it resolves against the
    # working directory of the test run (presumably the repository root).
    return StudyLocus(
        _df=spark.read.parquet("tests/gentropy/data_samples/coloc_test.parquet"),
        _schema=StudyLocus.get_schema(),
    )


@pytest.fixture()
def sample_target_index(spark: SparkSession) -> DataFrame:
"""Sample target index sample data."""
Expand Down
Binary file added tests/gentropy/data_samples/coloc_test.parquet
Binary file not shown.
45 changes: 43 additions & 2 deletions tests/gentropy/dataset/test_study_locus_overlaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
import pyspark.sql.types as t
import pytest

Expand Down Expand Up @@ -59,7 +60,12 @@ def test_study_locus_overlap_from_associations(mock_study_locus: StudyLocus) ->
False,
# expected - output DataFrame with overlapping signals
[
{"leftStudyLocusId": "1", "rightStudyLocusId": "2", "chromosome": "1"},
{
"leftStudyLocusId": "1",
"rightStudyLocusId": "2",
"rightStudyType": "eqtl",
"chromosome": "1",
},
],
),
(
Expand Down Expand Up @@ -93,7 +99,14 @@ def test_study_locus_overlap_from_associations(mock_study_locus: StudyLocus) ->
# intrastudy - bool of whether or not to use inter-study or intra-study logic
True,
# expected - output DataFrame with overlapping signals
[{"leftStudyLocusId": "2", "rightStudyLocusId": "1", "chromosome": "1"}],
[
{
"leftStudyLocusId": "2",
"rightStudyLocusId": "1",
"rightStudyType": "gwas",
"chromosome": "1",
}
],
),
],
)
Expand All @@ -118,10 +131,38 @@ def test_overlapping_peaks(
[
t.StructField("leftStudyLocusId", t.StringType()),
t.StructField("rightStudyLocusId", t.StringType()),
t.StructField("rightStudyType", t.StringType()),
t.StructField("chromosome", t.StringType()),
]
)
observed_df = spark.createDataFrame(observed, mock_schema)
result_df = StudyLocus._overlapping_peaks(observed_df, intrastudy)
expected_df = spark.createDataFrame(expected, expected_schema)
assert result_df.collect() == expected_df.collect()


class TestStudyLocusOverlap:
    """Test the overlapping of StudyLocus dataset."""

    @pytest.fixture(autouse=True)
    def setup(
        self: TestStudyLocusOverlap, study_locus_sample_for_colocalisation: StudyLocus
    ) -> None:
        """Store the sample dataset and its computed overlaps on the test instance.

        Args:
            study_locus_sample_for_colocalisation (StudyLocus): Sample study locus dataset.
        """
        # Store input dataset:
        self.study_locus = study_locus_sample_for_colocalisation

        # Call locus overlap:
        self.overlaps = study_locus_sample_for_colocalisation.find_overlaps()

    def test_coloc_return_type(self: TestStudyLocusOverlap) -> None:
        """Test that find_overlaps returns a StudyLocusOverlap object."""
        assert isinstance(
            self.overlaps, StudyLocusOverlap
        ), f"Expected StudyLocusOverlap, got {type(self.overlaps).__name__}."

    def test_coloc_not_null(self: TestStudyLocusOverlap) -> None:
        """Test that the overlap dataset contains at least one row."""
        assert (
            self.overlaps.df.count() != 0
        ), "find_overlaps returned no rows for the sample dataset."

    def test_coloc_study_type_not_null(self: TestStudyLocusOverlap) -> None:
        """Test that no overlap row has a null rightStudyType."""
        # Regression guard: rightStudyType must be populated on every overlap row.
        assert (
            self.overlaps.filter(f.col("rightStudyType").isNull()).df.count() == 0
        ), "Some overlap rows have a null rightStudyType."

0 comments on commit 9f446e8

Please sign in to comment.