Skip to content

Commit

Permalink
fix(validation): add qualityControls column if missing in StudyLocu…
Browse files Browse the repository at this point in the history
…s dataset when performing validation (opentargets#814)

* fix: coalesce qualityControl column in credible set

* fix: resolve missing qualityControls column when validating studyLocus

* chore: whitespace

* chore: use getter to infer qc column name

* chore: drop show

* chore: drop warnings for step_test(s)

---------

Co-authored-by: Szymon Szyszkowski <[email protected]>
  • Loading branch information
project-defiant and Szymon Szyszkowski authored Oct 4, 2024
1 parent fca55be commit 68c0168
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ site/
.env
.coverage*
wandb/
hail*.log
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ exclude = ["dist"]
addopts = "-n auto --doctest-modules --cov=src/ --cov-report=xml"
pythonpath = ["."]
testpaths = ["tests/gentropy", "src/gentropy"]
marks = ["step_test"]
markers = ["step_test"]

# Semi-strict mode for mypy
[tool.mypy]
Expand Down
29 changes: 29 additions & 0 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,3 +818,32 @@ def get_nested_struct_schema(dtype: t.DataType) -> t.StructType:
return get_nested_struct_schema(dtype)
case _:
raise TypeError("The input data type must be a nested struct.")


def get_struct_field_schema(schema: t.StructType, name: str) -> t.DataType:
    """Get schema for underlying struct field.

    Only the top-level fields of the provided schema are inspected; the first
    field whose name matches exactly is used.

    Args:
        schema (t.StructType): Provided schema where the name should be looked in.
        name (str): Name of the field to look in the schema

    Returns:
        t.DataType: Data type of the StructField with provided name

    Raises:
        ValueError: If provided name is not present in the input schema

    Examples:
        >>> get_struct_field_schema(t.StructType([t.StructField("a", t.StringType())]), "a")
        StringType()

        >>> get_struct_field_schema(t.StructType([t.StructField("a", t.StringType())]), "b") # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ...
        ValueError: Provided name b is not present in the schema.
    """
    matching_fields = [field for field in schema.fields if field.name == name]
    if not matching_fields:
        # Exceptions do not interpolate logging-style ("%s", arg) arguments,
        # so the message has to be fully formatted before it is raised.
        raise ValueError(f"Provided name {name} is not present in the schema.")
    return matching_fields[0].dataType
15 changes: 13 additions & 2 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import (
calculate_neglog_pvalue,
create_empty_column_if_not_exists,
get_struct_field_schema,
order_array_of_structs_by_field,
)
from gentropy.common.utils import get_logsum
Expand Down Expand Up @@ -271,10 +273,19 @@ def validate_lead_pvalue(self: StudyLocus, pvalue_cutoff: float) -> StudyLocus:
Returns:
StudyLocus: Updated study locus with quality control flags.
"""
df = self.df
qc_colname = StudyLocus.get_QC_column_name()
if qc_colname not in self.df.columns:
df = self.df.withColumn(
qc_colname,
create_empty_column_if_not_exists(
qc_colname, get_struct_field_schema(StudyLocus.get_schema(), qc_colname)
),
)
return StudyLocus(
_df=(
self.df.withColumn(
"qualityControls",
df.withColumn(
qc_colname,
# Because this QC might already run on the dataset, the unique set of flags is generated:
f.array_distinct(
self._qc_subsignificant_associations(
Expand Down
40 changes: 40 additions & 0 deletions tests/gentropy/datasource/finngen/test_finngen_finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

from __future__ import annotations

from pathlib import Path

import hail as hl
import pytest
from pyspark.sql import SparkSession

from gentropy.common.session import Session
from gentropy.dataset.study_locus import StudyLocus
from gentropy.datasource.finngen.finemapping import FinnGenFinemapping
from gentropy.finngen_finemapping_ingestion import FinnGenFinemappingIngestionStep


@pytest.mark.parametrize(
Expand Down Expand Up @@ -43,3 +47,39 @@ def test_finngen_finemapping_from_finngen_susie_finemapping(
),
StudyLocus,
)


@pytest.mark.parametrize(
    (
        "finngen_susie_finemapping_snp_files",
        "finngen_susie_finemapping_cs_summary_files",
    ),
    [
        pytest.param(
            "tests/gentropy/data_samples/finngen_R9_AB1_EBV.SUSIE.snp.gz",
            "tests/gentropy/data_samples/finngen_credset_summary_sample.tsv",
            id="non block compressed files",
        ),
    ],
)
@pytest.mark.step_test
def test_finngen_finemapping_ingestion_step(
    session: Session,
    finngen_susie_finemapping_snp_files: str,
    finngen_susie_finemapping_cs_summary_files: str,
    tmp_path: Path,
) -> None:
    """Run the FinnGen finemapping ingestion step end to end on sample files.

    The step writes into a fresh directory under ``tmp_path``. The test then
    checks that the directory exists, that it contains a ``_SUCCESS`` marker,
    and that reading it back as a ``StudyLocus`` yields exactly one row.
    """
    step_output = tmp_path / "output"

    # The step is executed for its side effect (writing the dataset); the
    # instance itself is not needed afterwards.
    FinnGenFinemappingIngestionStep(
        session=session,
        finngen_susie_finemapping_snp_files=finngen_susie_finemapping_snp_files,
        finngen_susie_finemapping_cs_summary_files=finngen_susie_finemapping_cs_summary_files,
        finngen_finemapping_lead_pvalue_threshold=1e-5,
        finngen_finemapping_out=str(step_output),
    )

    success_marker = step_output / "_SUCCESS"
    assert step_output.is_dir()
    assert success_marker.exists()

    credible_sets = StudyLocus.from_parquet(session=session, path=str(step_output))
    row_count = credible_sets.df.count()
    assert row_count == 1

0 comments on commit 68c0168

Please sign in to comment.