Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional input arguments to vc-gatk4-hc #1403

Merged
merged 2 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Added
- Add filtering ``Variant`` by ``VariantExperiment`` and ``VariantCall``
- Use generic permission filters when filtering variants by related models
- Return only distinct ``VariantExperiment`` objects
- Add ``--bam-output`` input argument to ``vc-gatk4-hc``
- Add ``--max-mnp-distance`` input argument to ``vc-gatk4-hc``


===================
Expand Down
44 changes: 43 additions & 1 deletion resolwe_bio/processes/variant_calling/gatk4_hc.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class GatkHaplotypeCaller(Process):
name = "GATK4 (HaplotypeCaller)"
category = "GATK"
process_type = "data:variants:vcf:gatk:hc"
version = "1.5.0"
version = "1.6.0"
scheduling_class = SchedulingClass.BATCH
entity = {"type": "sample"}
requirements = {
Expand Down Expand Up @@ -113,6 +113,33 @@ class Advanced:
default=12,
description="Set the maximum Java heap size (in GB).",
)
bam_out = BooleanField(
label="Output a BAM containing assembled haplotypes.",
default=False,
description="Output a BAM containing assembled haplotypes. "
"Really for debugging purposes only. "
"Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. "
"Turning on this mode may result in serious performance cost for the caller.",
)
bam_writer_type = StringField(
label="BAM writer type",
default="CALLED_HAPLOTYPES",
description="The type of BAM output. "
"This determines whether HC will write out all of the haplotypes it considered (top 128 max) "
"or just the ones that were selected as alleles and assigned to samples.",
choices=[
("CALLED_HAPLOTYPES", "Assigned haplotypes"),
("ALL_POSSIBLE_HAPLOTYPES", "All considered haplotypes"),
("NO_HAPLOTYPES", "Do not output haplotypes"),
],
disabled="!bam_out",
)
max_mnp_distance = IntegerField(
label="Max MNP distance",
default=0,
description="Two or more phased substitutions separated by this distance or less are merged into MNPs. "
"Set 0 to disable.",
)

advanced = GroupField(Advanced, label="Advanced options")

Expand All @@ -121,6 +148,8 @@ class Output:

vcf = FileField(label="VCF file")
tbi = FileField(label="Tabix index")
bam = FileField(label="Alignment file", required=False)
bai = FileField(label="BAM file index", required=False)
species = StringField(label="Species")
build = StringField(label="Build")

Expand Down Expand Up @@ -157,6 +186,8 @@ def run(self, inputs, outputs):
inputs.stand_call_conf,
"--tmp-dir",
TMPDIR,
"--max-mnp-distance",
inputs.advanced.max_mnp_distance,
]

if inputs.advanced.soft_clipped:
Expand All @@ -168,6 +199,11 @@ def run(self, inputs, outputs):
if inputs.advanced.interval_padding:
args.extend(["--interval-padding", inputs.advanced.interval_padding])

if inputs.advanced.bam_out:
output_bam = name + ".gatkHC.bam"
args.extend(["--bam-output", output_bam])
args.extend(["--bam-writer-type", inputs.advanced.bam_writer_type])

return_code, stdout, stderr = Cmd["gatk"]["HaplotypeCaller"][args] & TEE(
retcode=None
)
Expand All @@ -188,3 +224,9 @@ def run(self, inputs, outputs):
outputs.tbi = variants_index
outputs.species = inputs.alignment.output.species
outputs.build = inputs.alignment.output.build
if inputs.advanced.bam_out:
output_bai = Path(name + ".gatkHC.bai")
renamed_bai = output_bai.with_suffix(".bam.bai")
output_bai.rename(renamed_bai)
outputs.bam = output_bam
outputs.bai = str(renamed_bai)
Binary file not shown.
Binary file not shown.
8 changes: 8 additions & 0 deletions resolwe_bio/tests/processes/test_variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def test_gatk_haplotypecaller(self):
"soft_clipped": True,
"java_gc_threads": 3,
"max_heap_size": 7,
"bam_out": True,
"bam_writer_type": "CALLED_HAPLOTYPES",
},
},
)
Expand All @@ -205,6 +207,12 @@ def test_gatk_haplotypecaller(self):
file_filter=filter_vcf_variable,
compression="gzip",
)
self.assertFile(
gatk_rnaseq, "bam", output_folder / "56GSID_10k.rna-seq.gatkHC.bam"
)
self.assertFile(
gatk_rnaseq, "bai", output_folder / "56GSID_10k.rna-seq.gatkHC.bam.bai"
)

@tag_process("snpeff", "snpeff-single")
def test_snpeff(self):
Expand Down