chore(gnomad): updating GnomAD version to 4.1 from 4.0 + using joint frequencies (opentargets#929)

* fix: gnomad 4.1 frequencies

* fix: removing in-silico extraction in gnomad

* fix: removing in silico predictor ingestion from gnomad pre-process
DSuveges authored Nov 26, 2024
1 parent 8a83ec6 commit 008aa38
Showing 3 changed files with 9 additions and 38 deletions.
4 changes: 3 additions & 1 deletion src/gentropy/config.py
@@ -387,7 +387,9 @@ class GnomadVariantConfig(StepConfig):
}
)
variant_annotation_path: str = MISSING
- gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/"
+ gnomad_genomes_path: str = (
+     "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht/"
+ )
gnomad_variant_populations: list[str] = field(
default_factory=lambda: [
"afr", # African-American
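The new default points the step at the GnomAD 4.1 joint (exomes plus genomes) sites Hail Table rather than the 4.0 genomes-only release. A minimal sketch of opening that table and inspecting its frequency metadata, with the bucket path taken from the config above; the `hl.init` options and the inspection calls are illustrative assumptions, not part of this change:

```python
import hail as hl

# Illustrative initialisation; the real pipeline configures Hail through its own
# Spark/Dataproc setup.
hl.init(default_reference="GRCh38")

# Default path from the updated GnomadVariantConfig above.
gnomad_genomes_path = (
    "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht/"
)

ht = hl.read_table(gnomad_genomes_path)

# The joint release keys its per-population frequency arrays through a global
# lookup dictionary; printing the globals shows which population labels exist.
ht.describe()
print(hl.eval(ht.joint_globals.freq_index_dict))
```

Reading the table schema and evaluating a global only touches metadata, so this check is cheap compared with the full ingestion run.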
41 changes: 4 additions & 37 deletions src/gentropy/datasource/gnomad/variants.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@

from gentropy.common.types import VariantPopulation
from gentropy.config import GnomadVariantConfig, VariantIndexConfig
- from gentropy.dataset.variant_index import InSilicoPredictorNormaliser, VariantIndex
+ from gentropy.dataset.variant_index import VariantIndex

if TYPE_CHECKING:
pass
@@ -84,32 +84,11 @@ def as_variant_index(self: GnomADVariants) -> VariantIndex:
).map(
lambda p: hl.struct(
populationName=p,
- alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF,
+ alleleFrequency=ht.joint.freq[
+     ht.joint_globals.freq_index_dict[p]
+ ].AF,
)
),
- # Extract in silico predictors:
- inSilicoPredictors=hl.array(
-     [
-         hl.struct(
-             method=hl.str("SpliceAI"),
-             assessment=hl.missing(hl.tstr),
-             score=hl.expr.functions.float32(
-                 ht.in_silico_predictors.spliceai_ds_max
-             ),
-             assessmentFlag=hl.missing(hl.tstr),
-             targetId=hl.missing(hl.tstr),
-         ),
-         hl.struct(
-             method=hl.str("Pangolin"),
-             assessment=hl.missing(hl.tstr),
-             score=hl.expr.functions.float32(
-                 ht.in_silico_predictors.pangolin_largest_ds
-             ),
-             assessmentFlag=hl.missing(hl.tstr),
-             targetId=hl.missing(hl.tstr),
-         ),
-     ]
- ),
# Extract cross references to GnomAD:
dbXrefs=hl.array(
[
@@ -133,11 +112,6 @@ def as_variant_index(self: GnomADVariants) -> VariantIndex:
.to_spark(flatten=False)
.withColumns(
{
- # Once The parsing is done, we have to drop objects with no score from inSilicoPredictors:
- "inSilicoPredictors": f.filter(
-     f.col("inSilicoPredictors"),
-     lambda predictor: predictor["score"].isNotNull(),
- ),
# Generate a variantId that is hashed for long variant ids:
"variantId": VariantIndex.hash_long_variant_ids(
f.col("variantId"),
@@ -149,13 +123,6 @@ def as_variant_index(self: GnomADVariants) -> VariantIndex:
"mostSevereConsequenceId": f.lit(None).cast(t.StringType()),
}
)
- # Normalising in silico predictor assessments:
- .withColumn(
-     "inSilicoPredictors",
-     InSilicoPredictorNormaliser.normalise_in_silico_predictors(
-         f.col("inSilicoPredictors")
-     ),
- )
),
_schema=VariantIndex.get_schema(),
)
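The substantive change above is twofold: allele frequencies are now read from the joint frequency arrays (`ht.joint.freq`, indexed via `ht.joint_globals.freq_index_dict`) instead of the genomes-only `ht.freq`/`ht.globals` pair, and in-silico predictor extraction (SpliceAI, Pangolin) is dropped from this datasource along with its filtering and normalisation steps. A standalone sketch of the new frequency access pattern; the population labels below are assumptions for illustration and must match keys actually present in `freq_index_dict`:

```python
import hail as hl

ht = hl.read_table(
    "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht/"
)

# Hypothetical population labels; check hl.eval(ht.joint_globals.freq_index_dict)
# for the exact keys expected by the lookup below.
populations = hl.literal(["afr", "amr", "eas", "nfe"])

ht = ht.select(
    alleleFrequencies=populations.map(
        lambda p: hl.struct(
            populationName=p,
            # GnomAD 4.1 joint release: AF lives under `joint.freq`, indexed
            # through the `joint_globals.freq_index_dict` lookup table.
            alleleFrequency=ht.joint.freq[ht.joint_globals.freq_index_dict[p]].AF,
        )
    )
)

ht.show(5)
```

Because `joint.freq` combines exome and genome calls, the resulting values can differ from the genomes-only frequencies returned by the previous 4.0 default path.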
2 changes: 2 additions & 0 deletions src/gentropy/gnomad_ingestion.py
@@ -105,6 +105,8 @@ def __init__(
gnomad_genomes_path, variant_annotation_path
)

session.logger.info("Gnomad variant annotation path:")
session.logger.info(variant_annotation_path)
# Parse variant info from source.
(
GnomADVariants(
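The only change to the ingestion step itself is logging the configured variant annotation path before parsing starts, which makes the resolved configuration visible in the step logs. A minimal sketch of the same pattern with the standard library logger; `session.logger` is assumed here to behave like a regular `logging.Logger`, and the path value is a hypothetical stand-in:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gnomad_ingestion")

# Hypothetical stand-in for the value resolved from GnomadVariantConfig at runtime.
variant_annotation_path = "gs://example-bucket/variant_annotation"

# Mirrors the two new session.logger.info calls: surface the resolved path before
# the Hail parsing job is launched.
logger.info("Gnomad variant annotation path:")
logger.info(variant_annotation_path)
```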
