Skip to content

Commit

Permalink
Merge pull request #651 from broadinstitute/jg/change_denovo_to_make_…
Browse files Browse the repository at this point in the history
…full_trio_mt

Make dense trio MT
  • Loading branch information
jkgoodrich authored Dec 18, 2024
2 parents af3f365 + 350d345 commit d829f26
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 2 deletions.
9 changes: 9 additions & 0 deletions gnomad_qc/v4/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def get_gnomad_v4_vds(
entries_to_keep: Optional[List[str]] = None,
annotate_het_non_ref: bool = False,
checkpoint_variant_data: bool = False,
naive_coalesce_partitions: Optional[int] = None,
) -> hl.vds.VariantDataset:
"""
Get gnomAD v4 data with desired filtering and metadata annotations.
Expand Down Expand Up @@ -96,6 +97,8 @@ def get_gnomad_v4_vds(
'_het_non_ref') to the variant data. Default is False.
:param checkpoint_variant_data: Whether to checkpoint the variant data MT after
splitting and filtering. Default is False.
:param naive_coalesce_partitions: Optional argument to coalesce the VDS to a
specific number of partitions using naive coalesce.
:return: gnomAD v4 dataset with chosen annotations and filters.
"""
if remove_hard_filtered_samples and remove_hard_filtered_samples_no_sex:
Expand Down Expand Up @@ -158,6 +161,12 @@ def get_gnomad_v4_vds(
logger.info("Filtering to chromosome %s...", chrom)
vds = hl.vds.filter_chromosomes(vds, keep=chrom)

if naive_coalesce_partitions:
vds = hl.vds.VariantDataset(
vds.reference_data.naive_coalesce(naive_coalesce_partitions),
vds.variant_data.naive_coalesce(naive_coalesce_partitions),
)

if filter_partitions:
logger.info("Filtering to %s partitions...", len(filter_partitions))
vds = hl.vds.VariantDataset(
Expand Down
25 changes: 25 additions & 0 deletions gnomad_qc/v4/resources/sample_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,31 @@ def ped_filter_param_json_path(
return f"{get_sample_qc_root(version)}/relatedness/trios/gnomad.exomes.v{version}.ped_filters.json"


def dense_trio_mt(
releasable: bool = True,
test: bool = False,
) -> VersionedMatrixTableResource:
"""
Get the VersionedMatrixTableResource for the dense trio MatrixTable.
:param releasable: Whether to get the resource for the releasable trios only.
:param test: Whether to use a tmp path for a test resource.
:return: VersionedMatrixTableResource of dense trio MatrixTable.
"""
data_type = "exomes"
return VersionedMatrixTableResource(
CURRENT_SAMPLE_QC_VERSION,
{
version: MatrixTableResource(
f"{get_sample_qc_root(version, test, data_type='exomes')}"
f"/relatedness/trios/gnomad.{data_type}.v{version}.trios"
f"{'.releasable' if releasable else ''}.dense.mt"
)
for version in SAMPLE_QC_VERSIONS
},
)


######################################################################
# Other resources
######################################################################
Expand Down
97 changes: 95 additions & 2 deletions gnomad_qc/v4/sample_qc/identify_trios.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import hail as hl
from gnomad.sample_qc.relatedness import (
create_fake_pedigree,
filter_to_trios,
get_duplicated_samples,
get_duplicated_samples_ht,
infer_families,
Expand All @@ -24,8 +25,9 @@
)
from gnomad_qc.slack_creds import slack_token
from gnomad_qc.v4.resources.basics import get_gnomad_v4_vds
from gnomad_qc.v4.resources.meta import project_meta
from gnomad_qc.v4.resources.meta import meta, project_meta
from gnomad_qc.v4.resources.sample_qc import (
dense_trio_mt,
duplicates,
finalized_outlier_filtering,
interval_qc_pass,
Expand Down Expand Up @@ -385,12 +387,64 @@ def filter_ped(
return hl.Pedigree([trio for trio in ped.trios if trio.s in trios]), cutoffs


def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollection:
def create_dense_trio_mt(
fam_ht: hl.Table,
meta_ht: hl.Table,
releasable_only: bool = True,
test: bool = False,
naive_coalesce_partitions: Optional[int] = None,
) -> hl.MatrixTable:
"""
Create a dense MatrixTable for high quality trios.
:param fam_ht: Table with family information.
:param meta_ht: Table with metadata information.
:param releasable_only: Whether to only include trios that are releasable. Default
is True.
:param test: Whether to filter to chr20 for testing. Default is False.
:param naive_coalesce_partitions: Optional Number of partitions to coalesce the VDS
to. Default is None.
:return: Dense MatrixTable with high quality trios.
"""
filter_expr = meta_ht.high_quality
if releasable_only:
filter_expr &= meta_ht.project_meta.releasable

# Filter the metadata table to only samples in high quality and releasable trios.
meta_ht = meta_ht.filter(filter_expr)
fam_ht = fam_ht.filter(
hl.is_defined(meta_ht[fam_ht.id])
& hl.is_defined(meta_ht[fam_ht.pat_id])
& hl.is_defined(meta_ht[fam_ht.mat_id])
)
meta_ht = filter_to_trios(meta_ht, fam_ht)

# Get the gnomAD VDS filtered to high quality releasable trios.
# Using 'entries_to_keep' to keep all entries that are not `gvcf_info` because it
# is likely not needed, and removal will reduce the size of the dense MatrixTable.
vds = get_gnomad_v4_vds(
high_quality_only=True,
chrom="chr20" if test else None,
filter_samples_ht=meta_ht,
entries_to_keep=["LA", "LGT", "LAD", "LPGT", "LPL", "DP", "GQ", "SB"],
naive_coalesce_partitions=naive_coalesce_partitions,
)

return hl.vds.to_dense_mt(vds)


def get_trio_resources(
overwrite: bool,
test: bool,
dense_trio_mt_releasable_only: bool = True,
) -> PipelineResourceCollection:
"""
Get PipelineResourceCollection for all resources needed in the trio identification pipeline.
:param overwrite: Whether to overwrite existing resources.
:param test: Whether to use test resources.
:param dense_trio_mt_releasable_only: Whether to only include trios that are
releasable in the dense trio MT. Default is True.
:return: PipelineResourceCollection containing resources for all steps of the
trio identification pipeline.
"""
Expand Down Expand Up @@ -457,6 +511,16 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
},
pipeline_input_steps=[infer_families, run_mendel_errors],
)
create_dense_trio_mt = PipelineStepResourceCollection(
"--create-dense-trio-mt",
output_resources={
"dense_trio_mt": dense_trio_mt(
releasable=dense_trio_mt_releasable_only, test=test
)
},
pipeline_input_steps=[finalize_ped],
add_input_resources={"Finalized metadata HT": {"meta_ht": meta()}},
)

# Add all steps to the trio identification pipeline resource collection.
trio_pipeline.add_steps(
Expand All @@ -466,6 +530,7 @@ def get_trio_resources(overwrite: bool, test: bool) -> PipelineResourceCollectio
"create_fake_pedigree": create_fake_pedigree,
"run_mendel_errors": run_mendel_errors,
"finalize_ped": finalize_ped,
"create_dense_trio_mt": create_dense_trio_mt,
}
)

Expand Down Expand Up @@ -566,6 +631,17 @@ def main(args):
with hl.hadoop_open(res.filter_json, "w") as d:
d.write(json.dumps(filters))

if args.create_dense_trio_mt:
res = trio_resources.create_dense_trio_mt
res.check_resource_existence()
create_dense_trio_mt(
res.final_ped.ht(),
res.meta_ht.ht(),
args.releasable_only,
test,
naive_coalesce_partitions=args.naive_coalesce_partitions,
).write(res.dense_trio_mt.path, overwrite=args.overwrite)


def get_script_argument_parser() -> argparse.ArgumentParser:
"""Get script argument parser."""
Expand Down Expand Up @@ -724,6 +800,23 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
type=int,
default=24,
)
dense_trio_mt_args = parser.add_argument_group("Create dense trio MT.")
dense_trio_mt_args.add_argument(
"--create-dense-trio-mt",
help=("Create a dense MT for high quality trios."),
action="store_true",
)
dense_trio_mt_args.add_argument(
"--releasable-only",
help=("Only include trios that are releasable."),
action="store_true",
)
dense_trio_mt_args.add_argument(
"--naive-coalesce-partitions",
help=("Number of partitions to coalesce the VDS to."),
type=int,
default=5000,
)

return parser

Expand Down

0 comments on commit d829f26

Please sign in to comment.