From 70258d957c6818d1d38baa8425cb9ab6ca2b5a8f Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:35:38 -0400 Subject: [PATCH 01/16] fix: add hg19 to rules --- conf/base.config | 5 +++++ main.nf | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conf/base.config b/conf/base.config index c538c5c..50baa58 100644 --- a/conf/base.config +++ b/conf/base.config @@ -68,6 +68,11 @@ process { memory = { check_max( 48.GB * task.attempt, 'memory' ) } time = { check_max( 72.h * task.attempt, 'time' ) } } + withName:bwamem2 { + cpus = { check_max( 20 * task.attempt, 'cpus' ) } + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + time = { check_max( 72.h * task.attempt, 'time' ) } + } withLabel:error_ignore { errorStrategy = 'ignore' } diff --git a/main.nf b/main.nf index 325842a..1541830 100644 --- a/main.nf +++ b/main.nf @@ -55,7 +55,7 @@ workflow { if (params.cnv){ if (params.genome == "mm10"){ CNVmouse(ALIGN.out.bamwithsample) - } else if (params.genome== "hg38"){ + } else if (params.genome== "hg38" |params.genome== "hg19"){ if (!params.vc){ CNVhuman_novc(ALIGN.out.bamwithsample) } else { @@ -84,7 +84,7 @@ workflow { if (params.cnv){ if (params.genome == "mm10"){ CNVmouse(INPUT_BAM.out.bamwithsample) - } else if (params.genome == "hg38"){ + } else if (params.genome == "hg38"|params.genome== "hg19"){ if (!params.vc){ CNVhuman_novc(INPUT_BAM.out.bamwithsample) }else { @@ -108,7 +108,7 @@ workflow { if (params.cnv){ if (params.genome == "mm10"){ CNVmouse_tonly(ALIGN_TONLY.out.bamwithsample) - } else if (params.genome== "hg38"){ + } else if (params.genome== "hg38"|params.genome== "hg19"){ if (!params.vc){ VC_TONLY(ALIGN_TONLY.out.bamwithsample,ALIGN_TONLY.out.splitout,ALIGN_TONLY.out.sample_sheet) CNVhuman_tonly(ALIGN_TONLY.out.bamwithsample,VC_TONLY.out.somaticcall_input) @@ -135,7 +135,7 @@ workflow { if (params.cnv){ if (params.genome == "mm10"){ CNVmouse_tonly(INPUT_TONLY_BAM.out.bamwithsample) - } else if (params.genome== "hg38"){ + } else if (params.genome== "hg38" | params.genome== "hg19"){ if (!params.vc){ VC_TONLY(INPUT_TONLY_BAM.out.bamwithsample,INPUT_TONLY_BAM.out.splitout,INPUT_TONLY_BAM.out.sample_sheet) CNVhuman_tonly(INPUT_TONLY_BAM.out.bamwithsample,VC_TONLY.out.somaticcall_input) From c03a313698bfd65261068c6641b902cd8d3736b2 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:36:12 -0400 Subject: [PATCH 02/16] feat: hg19 purple --- modules/local/copynumber.nf | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/modules/local/copynumber.nf b/modules/local/copynumber.nf index 3b2855d..1bcfe9c 100644 --- a/modules/local/copynumber.nf +++ b/modules/local/copynumber.nf @@ -15,13 +15,15 @@ if (params.genome=="mm10"){ } if (params.genome=="hg38" | params.genome=="hg19"){ + HMFGENOMEREF = file(params.genomes[params.genome].HMFGENOME) GENOMEVER = params.genomes[params.genome].GENOMEVER GCPROFILE = file(params.genomes[params.genome].GCPROFILE) GERMLINEHET = file(params.genomes[params.genome].GERMLINEHET) DIPLODREG = file(params.genomes[params.genome].DIPLODREG) ENSEMBLCACHE = params.genomes[params.genome].ENSEMBLCACHE DRIVERS = file(params.genomes[params.genome].DRIVERS) - HOTSPOTS = file(params.genomes[params.genome].HOTSPOTS) + SOMATICHOTSPOTS = file(params.genomes[params.genome].SOMATICHOTSPOTS) + GERMLINEHOTSPOTS = file(params.genomes[params.genome].GERMLINEHOTSPOTS) } //mm10 Paired-Sequenza, FREEC-tumor only @@ -349,7 +351,7 @@ process amber_tonly { -tumor ${tumorname} -tumor_bam ${tumor} \ -output_dir ${tumorname}_amber \ -threads $task.cpus \ - -ref_genome_version V38 \ + -ref_genome_version $HMFGENOMEREF \ -loci $GERMLINEHET """ @@ -372,7 +374,8 @@ process amber_tn { val(normalname), path(normal), path(normalbai) output: - tuple val(tumorname), path("${tumorname}_vs_${normalname}_amber") + tuple val("${tumorname}_vs_${normalname}"), + val(tumorname), val(normalname), path("${tumorname}_vs_${normalname}_amber") script: @@ -383,7 +386,7 @@ process amber_tn { -reference ${normalname} -reference_bam ${normal} \ -output_dir ${tumorname}_vs_${normalname}_amber \ -threads $task.cpus \ - -ref_genome_version V38 \ + -ref_genome_version $HMFGENOMEREF \ -loci $GERMLINEHET """ @@ -436,7 +439,8 @@ process cobalt_tn { val(normalname), path(normal), path(normalbai) output: - tuple val(tumorname), path("${tumorname}_vs_${normalname}_cobalt") + tuple val("${tumorname}_vs_${normalname}"), + val(tumorname), val(normalname), path("${tumorname}_vs_${normalname}_cobalt") script: @@ -464,12 +468,12 @@ process purple { label 'process_medium' input: - tuple val(tumorname), val(normalname), + tuple val(id), val(tumorname), val(normalname), path(amberin), path(cobaltin), path(somaticvcf), path(somaticvcfindex) output: - tuple val(tumorname), path("${tumorname}") + tuple val(id), path("${id}") script: @@ -485,16 +489,16 @@ process purple { $ENSEMBLCACHE \ -somatic_vcf ${somaticvcf} \ -driver_gene_panel $DRIVERS \ - -somatic_hotspots $HOTSPOTS \ + -somatic_hotspots $SOMATICHOTSPOTS \ -threads $task.cpus \ - -output_dir ${tumorname} + -output_dir ${id} """ stub: """ - mkdir ${tumorname} - touch ${tumorname}/${tumorname}.purple.cnv.somatic.tsv ${tumorname}/${tumorname}.purple.cnv.gene.tsv ${tumorname}/${tumorname}.driver.catalog.somatic.tsv + mkdir ${id} + touch ${id}/${id}.purple.cnv.somatic.tsv ${id}/${id}.purple.cnv.gene.tsv ${id}/${id}.driver.catalog.somatic.tsv """ } @@ -505,7 +509,7 @@ process purple_novc { label 'process_medium' input: - tuple val(tumorname), val(normalname), + tuple val(id), val(tumorname), val(normalname), path(cobaltin), path(amberin) output: @@ -521,10 +525,10 @@ process purple_novc { -cobalt ${cobaltin} \ -gc_profile $GCPROFILE \ -ref_genome_version $GENOMEVER \ - -ref_genome $GENOMEREF \ + -ref_genome $HMFGENOMEREF \ $ENSEMBLCACHE \ -threads $task.cpus \ - -output_dir ${tumorname} + -output_dir ${tvn} """ From 54054fdb51abaa0c2d791f23120b3c39b30e6871 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:36:24 -0400 Subject: [PATCH 03/16] feat: hg19 cnv changes --- conf/genomes.config | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/conf/genomes.config b/conf/genomes.config index 07a05d0..040c1b6 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -31,7 +31,9 @@ params { chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM'] //HMFTOOLS GENOMEVER = "38" - HOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz" + HMFGENOME = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/bwamem2/GRCh38.d1.vd1.fa" + SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz" + GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.germline.38.vcf.gz" PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz" HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz" ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data" @@ -45,7 +47,7 @@ params { 'hg19' { genome = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa" genomefai = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa.fai" - bwagenome= "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa" + bwagenome = "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa" genomedict= "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.dict" intervals= "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hg19_noblacklist_maincontig.bed" INDELREF = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/GATKbundle/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz" @@ -71,16 +73,17 @@ params { chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM'] //HMFTOOLS GENOMEVER = "37" - HOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.38.vcf.gz" - PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz" - HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz" - ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data" + HMFGENOME = "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa" + SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/KnownHotspots.somatic.37.vcf.gz" + GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/KnownHotspots.germline.37.vcf.gz" + PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/ActionableCodingPanel.37.bed.gz" + HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed.gz" + ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/common/ensembl_data" //PURPLE - GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/AmberGermlineSites.38.tsv.gz" - GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/GC_profile.1000bp.38.cnp" - DIPLODREG = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/DiploidRegions.38.bed.gz' - ENSEMBLCACHE = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/ensembl_data/' - DRIVERS = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/DriverGenePanel.38.tsv' + GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/AmberGermlineSites.37.tsv.gz" + GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/GC_profile.1000bp.37.cnp" + DIPLODREG = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/DiploidRegions.37.bed.gz' + DRIVERS = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/common/DriverGenePanel.37.tsv' } 'mm10' { From b56a8a0853a0f6d3c97fed6217366205f87c44c4 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:36:45 -0400 Subject: [PATCH 04/16] fix: no file param --- modules/local/structural_variant.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/structural_variant.nf b/modules/local/structural_variant.nf index 1ea049a..3b1038e 100644 --- a/modules/local/structural_variant.nf +++ b/modules/local/structural_variant.nf @@ -1,5 +1,5 @@ GENOMEREF=file(params.genomes[params.genome].genome) -ANNOTSVGENOME=file(params.genomes[params.genome].annotsvgenome) +ANNOTSVGENOME=params.genomes[params.genome].annotsvgenome BWAGENOME=file(params.genomes[params.genome].bwagenome) INDELREF=file(params.genomes[params.genome].INDELREF) From 678ecfaaea6b7d7b1d3c1d31541be5d97ad50ad5 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:36:54 -0400 Subject: [PATCH 05/16] fix: bwa mem --- modules/local/trim_align.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/trim_align.nf b/modules/local/trim_align.nf index 2df6abf..f91168e 100644 --- a/modules/local/trim_align.nf +++ b/modules/local/trim_align.nf @@ -42,7 +42,6 @@ process fastp { process bwamem2 { container = "${params.containers.logan}" - label 'process_high' tag { name } From cd9b73bb6b58d21f42fbea2c68ff1cfbe54e9381 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:37:22 -0400 Subject: [PATCH 06/16] fix: add tumor vs normal to id for unique --- subworkflows/local/workflows.nf | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/workflows.nf b/subworkflows/local/workflows.nf index e7d9956..dfce894 100644 --- a/subworkflows/local/workflows.nf +++ b/subworkflows/local/workflows.nf @@ -288,7 +288,7 @@ workflow VC { | map { tumor,normal,vcfs,vcfindex,indels,indelindex -> tuple("${tumor}_vs_${normal}", vcfs.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.snvs.vcf.gz/)[0][1].toInteger() },vcfindex, indels.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() } ,indelindex)} - | combineVariants_strelka | join(sample_sheet_paired) + | combineVariants_strelka | join(sample_sheet_paired) | map{sample,markedvcf,markedindex,finalvcf,finalindex,tumor,normal -> tuple(tumor,normal,"strelka",finalvcf,finalindex)} annotvep_tn_strelka(strelka_in) @@ -469,21 +469,21 @@ workflow SV { svaba_out=svaba_somatic(bamwithsample) .map{ tumor,bps,contigs,discord,alignents,gindel,gsv,so_indel,so_sv,unfil_gindel,unfil_gsv,unfil_so_indel,unfil_sv,log -> tuple(tumor,so_sv,"svaba")} - annotsv_svaba(svaba_out).ifEmpty("Empty SV input--No SV annotated") + //annotsv_svaba(svaba_out).ifEmpty("Empty SV input--No SV annotated") } if ("manta" in svcall_list){ //Manta manta_out=manta_somatic(bamwithsample) .map{tumor,gsv,so_sv,unfil_sv,unfil_indel -> tuple(tumor,so_sv,"manta")} - annotsv_manta(manta_out).ifEmpty("Empty SV input--No SV annotated") + //annotsv_manta(manta_out).ifEmpty("Empty SV input--No SV annotated") } - //Delly-WIP - if ("manta" in svcall_list & "svaba" in svcall_list){ + if ("manta" in svcall_list & "svaba" in svcall_list){ //Survivor gunzip(manta_out).concat(svaba_out).groupTuple() - | survivor_sv | annotsv_survivor_tn | ifEmpty("Empty SV input--No SV annotated") + | survivor_sv + | annotsv_survivor_tn | ifEmpty("Empty SV input--No SV annotated") } } @@ -524,14 +524,14 @@ workflow CNVhuman { main: cnvcall_list = params.cnvcallers.split(',') as List - + scinput = somaticcall_input|map{t1,n1,cal,vcf,ind -> tuple("${t1}_vs_${n1}",cal,vcf,ind)} if ("purple" in cnvcall_list){ //Purple bamwithsample | amber_tn bamwithsample | cobalt_tn - purplein=amber_tn.out.join(cobalt_tn.out) - purplein.join(somaticcall_input) - | map{t1,amber,cobalt,n1,vc,vcf,vcfindex -> tuple(t1,n1,amber,cobalt,vcf,vcfindex)} + purplein=amber_tn.out.join(cobalt_tn.out,by:[0,1,2]) + purplein.join(scinput) + | map{id,t1,n1,amber,cobalt,vc,vcf,vcfindex -> tuple(id,t1,n1,amber,cobalt,vcf,vcfindex)} | purple } @@ -558,10 +558,10 @@ workflow CNVhuman_novc { if ("purple" in cnvcall_list){ //Purple - bamwithsample | amber_tn - bamwithsample | cobalt_tn - purplein=amber_tn.out |join(cobalt_tn.out) - purplein | map{t1,amber,cobalt,n1 -> tuple(t1,n1,amber,cobalt)} + bamwithsample | amber_tn + bamwithsample | cobalt_tn + purplein=amber_tn.out |join(cobalt_tn.out) + purplein | map{id,t1,n1,amber,t2,n2,cobalt -> tuple(id,t1,n1,amber,cobalt)} | purple_novc } From 662cb0fd5581c1b5fa7f03f3999afb3e0e0a2788 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Mon, 17 Jun 2024 21:37:34 -0400 Subject: [PATCH 07/16] fix: add strelka converter --- bin/convertStrelka.py | 92 ++++++++++++++++++++++++------------------ conf/containers.config | 3 +- 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/bin/convertStrelka.py b/bin/convertStrelka.py index 8550db1..c6ed2cd 100755 --- a/bin/convertStrelka.py +++ b/bin/convertStrelka.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -import os import numpy as np -import vcfpy +import cyvcf2 import sys +import gzip def _tumor_normal_genotypes(ref, alt, info): """Retrieve standard 0/0, 0/1, 1/1 style genotypes from INFO field. @@ -33,7 +33,7 @@ def name_to_gt(val): # print(fname, coords, ref, alt, info, val) return "0/1" def alleles_to_gt(val): - gt_indices = {gt.upper(): i for i, gt in enumerate([ref] + [alt])} + gt_indices = {gt.upper(): i for i, gt in enumerate([ref] + alt)} tumor_gts = [gt_indices[x.upper()] for x in val if x in gt_indices] if tumor_gts and val not in known_names: if max(tumor_gts) == 0: @@ -45,13 +45,13 @@ def alleles_to_gt(val): else: tumor_gt = name_to_gt(val) return tumor_gt - nt_val = info.get('NT').split("=")[-1] + nt_val = [x.split("=")[-1] for x in info if x.startswith("NT=")][0] normal_gt = name_to_gt(nt_val) - sgt_val = info.get('SGT').split("=")[-1] + sgt_val = [x.split("=")[-1] for x in info if x.startswith("SGT=")] if not sgt_val: tumor_gt = "0/0" else: - sgt_val = sgt_val.split("->")[-1] + sgt_val = sgt_val[0].split("->")[-1] tumor_gt = alleles_to_gt(sgt_val) return normal_gt, tumor_gt @@ -70,47 +70,61 @@ def _af_annotate_and_filter(in_file,out_file): #data = paired.tumor_data if paired else items[0] #min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0 #logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq) - vcf = vcfpy.Reader.from_path(in_file) - vcf.header.add_format_line(vcfpy.OrderedDict([ - ('ID', 'AF'), - ('Description', 'Allele frequency, as calculated in bcbio: AD/DP (germline), U/DP (somatic snps), TIR/DPI (somatic indels)'), - ('Type','Float'), - ('Number', '.') - ])) - vcf.header.add_format_line(vcfpy.OrderedDict([ - ('ID', 'GT'), - ('Description', 'Genotype'), - ('Type','String'), - ('Number', '1') - ])) - writer = vcfpy.Writer.from_path(out_file, vcf.header) + vcf = cyvcf2.VCF(in_file) + vcf.add_format_to_header(dict( + ID='AF', Number='1',Type='Float', + Description='Allele frequency, as calculated in bcbio: AD/DP (germline), U/DP (somatic snps), TIR/DPI (somatic indels)' + )) + writer = cyvcf2.Writer(out_file, vcf) for rec in vcf: #print(rec) - if rec.is_snv(): # snps? - alt_counts_n = rec.calls[0].data[rec.ALT[0].value + "U"] # {ALT}U=tier1_depth,tier2_depth - alt_counts_t = rec.calls[1].data[rec.ALT[0].value + "U"] # {ALT}U=tier1_depth,tier2_depth + if rec.is_snp: # snps? + alt_counts = rec.format(rec.ALT[0] +'U')[:,0] # {ALT}U=tier1_depth,tier2_depth else: # indels - alt_counts_n = rec.calls[0].data['TIR'] # TIR=tier1_depth,tier2_depth - alt_counts_t = rec.calls[1].data['TIR'] - DP_n=rec.calls[0].data["DP"] - DP_t=rec.calls[1].data["DP"] - if DP_n is not None and DP_t is not None: + alt_counts = rec.format('TIR')[:,0] # TIR=tier1_depth,tier2_depth + dp = rec.format('DP')[:,0] + if dp is not None : with np.errstate(divide='ignore', invalid='ignore'): # ignore division by zero and put AF=.0 #alt_n = alt_counts_n[0]/DP_n #alt_t = alt_counts_t[0]/DP_t - af_n = np.true_divide(alt_counts_n[0], DP_n) - af_t = np.true_divide(alt_counts_t[0], DP_t) - rec.add_format('AF',0) - rec.calls[0].data["AF"]= [round(af_n,5)] - rec.calls[1].data["AF"]= [round(af_t,5)] - normal_gt, tumor_gt= _tumor_normal_genotypes(rec.REF,rec.ALT[0].value,rec.INFO) - rec.add_format('GT',"1/0") - rec.calls[0].data["GT"]=normal_gt - rec.calls[1].data["GT"]=tumor_gt - writer.write_record(rec) + af = np.true_divide(alt_counts, dp) + rec.set_format('AF',np.round(af,5)) + writer.write_record(rec) + writer.close() + +def is_gzipped(path): + return path.endswith(".gz") + +def _add_gt(in_file): + ##Set genotypes now + out_file = in_file.replace(".vcf.gz", "-fixed.vcf") + #open_fn = gzip.open if is_gzipped(in_file) else open + with gzip.open(in_file,'rt') as in_handle: + with open(out_file,"w") as out_handle: + added_gt = False + for line in in_handle: + if line.startswith("##FORMAT") and not added_gt: + added_gt = True + out_handle.write('##FORMAT=\n') + out_handle.write(line) + elif line.startswith("#CHROM"): + assert added_gt + out_handle.write(line) + elif line.startswith("#"): + out_handle.write(line) + else: + parts = line.rstrip().split("\t") + normal_gt,tumor_gt = _tumor_normal_genotypes(parts[3], parts[4].split(","), + parts[7].split(";")) + parts[8] = "GT:%s" % parts[8] + parts[9] = "%s:%s" % (normal_gt, parts[9]) + parts[10] = "%s:%s" % (tumor_gt, parts[10]) + out_handle.write("\t".join(parts) + "\n") + if __name__ == '__main__': filename = sys.argv[1] outname = sys.argv[2] _af_annotate_and_filter(filename, outname) - + _add_gt(outname) + diff --git a/conf/containers.config b/conf/containers.config index c5ad361..213986f 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -6,6 +6,7 @@ params { vcf2maf = 'docker://dnousome/ccbr_vcf2maf:v102.0.0' lofreq = 'docker://dnousome/ccbr_lofreq:v0.0.1' octopus = 'docker://dancooke/octopus:latest' - annotcnvsv = 'docker://dnousome/ccbr_annotate_cnvsv:v0.0.1' + annotsv = "docker://quay.io/biocontainers/annotsv:3.4.2--py312hdfd78af_0" + annotcnvsv = 'docker://dnousome/ccbr_annotate_cnvsv:v0.0.2' } } From 9f613212c5e80cd3d63406aca910e114a9b443c5 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Tue, 18 Jun 2024 09:40:11 -0400 Subject: [PATCH 08/16] fix: genome version for hmf --- conf/genomes.config | 1 - modules/local/copynumber.nf | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conf/genomes.config b/conf/genomes.config index 040c1b6..f111b58 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -79,7 +79,6 @@ params { PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/ActionableCodingPanel.37.bed.gz" HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed.gz" ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/common/ensembl_data" - //PURPLE GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/AmberGermlineSites.37.tsv.gz" GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/GC_profile.1000bp.37.cnp" DIPLODREG = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/DiploidRegions.37.bed.gz' diff --git a/modules/local/copynumber.nf b/modules/local/copynumber.nf index 1bcfe9c..08e74c7 100644 --- a/modules/local/copynumber.nf +++ b/modules/local/copynumber.nf @@ -351,7 +351,7 @@ process amber_tonly { -tumor ${tumorname} -tumor_bam ${tumor} \ -output_dir ${tumorname}_amber \ -threads $task.cpus \ - -ref_genome_version $HMFGENOMEREF \ + -ref_genome_version $GENOMEVER \ -loci $GERMLINEHET """ @@ -386,7 +386,7 @@ process amber_tn { -reference ${normalname} -reference_bam ${normal} \ -output_dir ${tumorname}_vs_${normalname}_amber \ -threads $task.cpus \ - -ref_genome_version $HMFGENOMEREF \ + -ref_genome_version $GENOMEVER \ -loci $GERMLINEHET """ From 7e97a8dd6395789900b14c2211d046da9ca073ad Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Tue, 2 Jul 2024 09:21:17 -0400 Subject: [PATCH 09/16] fix: purple process order --- conf/modules.config | 2 +- modules/local/copynumber.nf | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d171cef..bb2fcbc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,7 @@ process { ] } - withName: 'purple' { + withName: 'purple|purple_tonly|purple_novc' { publishDir = [ path: { "${params.outdir}/cnv/purple" }, mode: 'copy' diff --git a/modules/local/copynumber.nf b/modules/local/copynumber.nf index 08e74c7..c1239ec 100644 --- a/modules/local/copynumber.nf +++ b/modules/local/copynumber.nf @@ -510,10 +510,10 @@ process purple_novc { input: tuple val(id), val(tumorname), val(normalname), - path(cobaltin), path(amberin) + path(amberin), path(cobaltin) output: - tuple val(tumorname), path("${tumorname}") + tuple val(id), val(tumorname), val(normalname), path("${id}") script: @@ -528,7 +528,7 @@ process purple_novc { -ref_genome $HMFGENOMEREF \ $ENSEMBLCACHE \ -threads $task.cpus \ - -output_dir ${tvn} + -output_dir ${id} """ @@ -548,7 +548,7 @@ process purple_tonly { input: tuple val(tumorname), - path(cobaltin), path(amberin), + path(amberin), path(cobaltin), path(somaticvcf), path(somaticvcfindex) output: From d17fc2a16ca2d7a995e73e4b5701da2274858f04 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Tue, 2 Jul 2024 09:21:32 -0400 Subject: [PATCH 10/16] feat: add genomes to support output from SF --- conf/genomes.config | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/conf/genomes.config b/conf/genomes.config index f111b58..e6e2e2c 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -121,5 +121,48 @@ params { chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chrX','chrY','chrM'] } + 'hg38_sf' { + genome = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/bwamem2/Homo_sapiens_assembly38.fasta" + genomefai = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/bwamem2/Homo_sapiens_assembly38.fasta.fai" + bwagenome= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.fasta" + genomedict= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.dict" + wgsregion = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/resources_broad_hg38_v0_wgs_calling_regions.hg38.interval_list" + intervals= "${projectDir}/assets/hg38_v0_wgs_calling_regions.hg38.bed" + fullinterval = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/genomes/hg38_main.bed" + INDELREF = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" + KNOWNINDELS = "-known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz" + KNOWNRECAL = '--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz' + dbsnp = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz" + gnomad = '--germline-resource /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz' + pon = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PON/updatedpon.vcf.gz" //pon="/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz" //file{params.pon} + germline_resource = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz" + KRAKENBACDB = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/kraken/20180907_standard_kraken2" + snpeff_genome = "GRCh38.86" + snpeff_config = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/snpEff/4.3t/snpEff.config" + snpeff_bundle = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/snpEff/4.3t/" + sites_vcf= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/sites.hg38.vcf.gz" + somalier_ancestrydb="/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/1kg-somalier" + vepcache = "/fdb/VEP/102/cache" + vepspecies = "homo_sapiens" + vepbuild = "GRCh38" + annotsvgenome = "GRCh38" + octopus_sforest= "--somatic-forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/somatic.v0.7.4.forest" + octopus_gforest= "--forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/germline.v0.7.4.forest" + SEQUENZAGC = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz" + chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM'] + //HMFTOOLS + GENOMEVER = "38" + HMFGENOME = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/bwamem2/GRCh38.d1.vd1.fa" + SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz" + GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.germline.38.vcf.gz" + PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz" + HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz" + ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data" + //PURPLE + GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/AmberGermlineSites.38.tsv.gz" + GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/GC_profile.1000bp.38.cnp" + DIPLODREG = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/DiploidRegions.38.bed.gz" + DRIVERS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/DriverGenePanel.38.tsv" + } } } From d4a4bf9a4f75f17ed650400c3d7c08b5f95ab10f Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Tue, 2 Jul 2024 09:21:49 -0400 Subject: [PATCH 11/16] fix: output additional octopus annotation --- modules/local/variant_calling.nf | 2 +- modules/local/variant_calling_tonly.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/variant_calling.nf b/modules/local/variant_calling.nf index a55b424..5d1a568 100644 --- a/modules/local/variant_calling.nf +++ b/modules/local/variant_calling.nf @@ -472,7 +472,7 @@ process octopus_tn { """ octopus -R $GENOMEREF -I ${normal} ${tumor} --normal-sample ${normalname} \ -C cancer \ - --annotations AF AC AD DP -t ${bed} \ + --annotations AF AC AD DP SB -t ${bed} \ --threads $task.cpus \ $GERMLINE_FOREST \ $SOMATIC_FOREST \ diff --git a/modules/local/variant_calling_tonly.nf b/modules/local/variant_calling_tonly.nf index 4e0991e..6a2516a 100644 --- a/modules/local/variant_calling_tonly.nf +++ b/modules/local/variant_calling_tonly.nf @@ -342,7 +342,7 @@ process octopus_tonly { script: """ octopus -R $GENOMEREF -C cancer -I ${tumor} \ - --annotations AF AC AD DP \ + --annotations AF AC AD DP SB \ -B 92Gb \ -t ${bed} \ --threads ${task.cpus}\ From 609a9bb1dfa17ba25aeff50eaded584af0bcd400 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Fri, 5 Jul 2024 10:26:19 -0400 Subject: [PATCH 12/16] feat: add GT for strelka --- bin/{convertStrelka.py => strelka_convert.py} | 16 +++++++++------- conf/modules.config | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) rename bin/{convertStrelka.py => strelka_convert.py} (93%) diff --git a/bin/convertStrelka.py b/bin/strelka_convert.py similarity index 93% rename from bin/convertStrelka.py rename to bin/strelka_convert.py index c6ed2cd..e0e6a34 100755 --- a/bin/convertStrelka.py +++ b/bin/strelka_convert.py @@ -3,6 +3,7 @@ import cyvcf2 import sys import gzip +import os def _tumor_normal_genotypes(ref, alt, info): """Retrieve standard 0/0, 0/1, 1/1 style genotypes from INFO field. @@ -92,15 +93,15 @@ def _af_annotate_and_filter(in_file,out_file): writer.write_record(rec) writer.close() -def is_gzipped(path): - return path.endswith(".gz") +#def is_gzipped(path): +# return path.endswith(".gz") def _add_gt(in_file): ##Set genotypes now - out_file = in_file.replace(".vcf.gz", "-fixed.vcf") + out_file = os.path.basename(in_file).replace(".vcf.gz", "-fixed.vcf") #open_fn = gzip.open if is_gzipped(in_file) else open with gzip.open(in_file,'rt') as in_handle: - with open(out_file,"w") as out_handle: + with open(out_file,"wt") as out_handle: added_gt = False for line in in_handle: if line.startswith("##FORMAT") and not added_gt: @@ -121,10 +122,11 @@ def _add_gt(in_file): parts[10] = "%s:%s" % (tumor_gt, parts[10]) out_handle.write("\t".join(parts) + "\n") - if __name__ == '__main__': filename = sys.argv[1] outname = sys.argv[2] - _af_annotate_and_filter(filename, outname) - _add_gt(outname) + _add_gt(filename) + newname = os.path.basename(filename).replace(".vcf.gz", "-fixed.vcf") + _af_annotate_and_filter(newname,outname) + os.remove(newname) diff --git a/conf/modules.config b/conf/modules.config index bb2fcbc..a1e595f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -224,14 +224,14 @@ process { ] } - withName: 'contamination_tumoronly|learnreadorientationmodel_tonly|learnreadorientationmodel|mergemut2stats|mergemut2stats_tonly|contamination_paired|mutect2filter' { + withName: 'learnreadorientationmodel|mergemut2stats|contamination_paired|mutect2filter' { publishDir = [ path: { "${params.outdir}/vcfs/mutect2" }, mode: 'copy' ] } - withName: 'mutect2filter_tonly' { + withName: 'mutect2filter_tonly|contamination_tumoronly|learnreadorientationmodel_tonly|mergemut2stats_tonly' { publishDir = [ path: { "${params.outdir}/vcfs/mutect2_tonly" }, mode: 'copy' From 4bf7e9bb7eead80d2f7a0a64de296adfa6f4a10a Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Fri, 5 Jul 2024 10:26:33 -0400 Subject: [PATCH 13/16] feat: strelka workflow --- subworkflows/local/workflows.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/workflows.nf b/subworkflows/local/workflows.nf index dfce894..d8b98af 100644 --- a/subworkflows/local/workflows.nf +++ b/subworkflows/local/workflows.nf @@ -17,7 +17,7 @@ include {pileup_paired_t; pileup_paired_n; strelka_tn; varscan_tn; vardict_tn; lofreq_tn; muse_tn; sage_tn; octopus_tn; bcftools_index_octopus; bcftools_index_octopus as bcftools_index_octopus_tonly; octopus_convertvcf; - combineVariants_strelka; + combineVariants_strelka; convert_strelka; combineVariants as combineVariants_vardict; combineVariants as combineVariants_vardict_tonly; combineVariants as combineVariants_varscan; combineVariants as combineVariants_varscan_tonly; combineVariants as combineVariants_sage; combineVariants as combineVariants_sage_tonly; @@ -290,6 +290,7 @@ workflow VC { indels.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() } ,indelindex)} | combineVariants_strelka | join(sample_sheet_paired) | map{sample,markedvcf,markedindex,finalvcf,finalindex,tumor,normal -> tuple(tumor,normal,"strelka",finalvcf,finalindex)} + | convert_strelka annotvep_tn_strelka(strelka_in) vc_all=vc_all|concat(strelka_in) From 8586a18dcd72dedff3ee55ad2640401c6f9120d7 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Fri, 5 Jul 2024 10:27:14 -0400 Subject: [PATCH 14/16] style: better visibility --- modules/local/variant_calling.nf | 38 ++++++++++++++++++++++---- modules/local/variant_calling_tonly.nf | 3 +- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/modules/local/variant_calling.nf b/modules/local/variant_calling.nf index 5d1a568..eae43f2 100644 --- a/modules/local/variant_calling.nf +++ b/modules/local/variant_calling.nf @@ -257,14 +257,12 @@ process mutect2filter { input: tuple val(tumor), val(normal),path(mutvcfs), path(stats), path(obs), - path(pileups), path(normal_pileups),path(tumorcontamination),path(normalcontamination) + path(pileups), path(normal_pileups), path(tumorcontamination), path(normalcontamination) output: tuple val("${tumor}_vs_${normal}"), - path("${tumor}_vs_${normal}.mut2.marked.vcf.gz"), - path("${tumor}_vs_${normal}.mut2.marked.vcf.gz.tbi"), - path("${tumor}_vs_${normal}.mut2.norm.vcf.gz"), - path("${tumor}_vs_${normal}.mut2.norm.vcf.gz.tbi"), + path("${tumor}_vs_${normal}.mut2.marked.vcf.gz"), path("${tumor}_vs_${normal}.mut2.marked.vcf.gz.tbi"), + path("${tumor}_vs_${normal}.mut2.norm.vcf.gz"), path("${tumor}_vs_${normal}.mut2.norm.vcf.gz.tbi"), path("${tumor}_vs_${normal}.mut2.marked.vcf.gz.filteringStats.tsv") script: @@ -813,6 +811,36 @@ process combineVariants_strelka { } +process convert_strelka { + //Add GT column to + conda 'bioconda::cyvcf2 bioconda::bcftools numpy' + label 'process_medium' + + input: + tuple val(tumor), val(normal), val(vc), + path(strelkavcf), path(strelkaindex) + + output: + tuple val(tumor), val(normal), val("strelka"), + path("${tumor}_vs_${normal}.filtered.strelka-fixed.vcf.gz"), + path("${tumor}_vs_${normal}.filtered.strelka-fixed.vcf.gz.tbi") + + + script: + + """ + python /data/nousomedr/wgs/LOGAN/bin/strelka_convert.py ${strelkavcf} ${tumor}_vs_${normal}.filtered.strelka-fixed.vcf.gz + bcftools index -t ${tumor}_vs_${normal}.filtered.strelka-fixed.vcf.gz + """ + + stub: + + """ + touch ${tumor}.filtered.strelka-fixed.vcf.gz ${tumor}.filtered.strelka-fixed.vcf.gz.tbi + """ + +} + process somaticcombine { container "${params.containers.logan}" label 'process_medium' diff --git a/modules/local/variant_calling_tonly.nf b/modules/local/variant_calling_tonly.nf index 6a2516a..f810a04 100644 --- a/modules/local/variant_calling_tonly.nf +++ b/modules/local/variant_calling_tonly.nf @@ -194,7 +194,7 @@ process mutect2filter_tonly { label 'process_medium' input: - tuple val(sample), path(mutvcfs), path(stats), path(obs), path(pileups),path(tumorcontamination) + tuple val(sample), path(mutvcfs), path(stats), path(obs), path(pileups), path(tumorcontamination) output: tuple val(sample), path("${sample}.tonly.mut2.marked.vcf.gz"),path("${sample}.tonly.mut2.marked.vcf.gz.tbi"), @@ -205,7 +205,6 @@ process mutect2filter_tonly { //Include the stats and concat ${mutvcfs} -Oz -o ${sample}.concat.vcf.gz mut2in = mutvcfs.join(" -I ") - """ gatk SortVcf -I ${mut2in} -O ${sample}.tonly.concat.vcf.gz --CREATE_INDEX gatk FilterMutectCalls \ From 7f8e7acb7ee0fc745981862c8804a185e2f8d5e7 Mon Sep 17 00:00:00 2001 From: dnousome Date: Mon, 8 Jul 2024 16:36:34 -0400 Subject: [PATCH 15/16] feat: docker update with gatk, cyvcf, hmftools changes --- docker/logan_base/Dockerfile | 46 +++++++++++++++++------------------- docker/logan_base/build.sh | 8 +++---- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/docker/logan_base/Dockerfile b/docker/logan_base/Dockerfile index e80ed91..d11804b 100644 --- a/docker/logan_base/Dockerfile +++ b/docker/logan_base/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 nciccbr/ccbr_ubuntu_base_20.04:v5 +FROM --platform=linux/amd64 nciccbr/ccbr_ubuntu_base_20.04:v6 # build time variables ARG BUILD_DATE="000000" @@ -21,7 +21,7 @@ RUN apt-get update \ && apt-get -y upgrade \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ bc \ - openjdk-17-jdk + openjdk-17-jdk #Needed for GATK >4 # Common bioinformatics tools # bwa/0.7.17-4 bowtie/1.2.3 bowtie2/2.3.5.1 @@ -48,11 +48,11 @@ RUN wget https://github.com/biod/sambamba/releases/download/v0.8.1/sambamba-0.8. # Install GATK4 (GATK/4.5.0.0) # Requires Java17 -RUN wget https://github.com/broadinstitute/gatk/releases/download/4.5.0.0/gatk-4.5.0.0.zip \ - && unzip /opt2/gatk-4.5.0.0.zip \ - && rm /opt2/gatk-4.5.0.0.zip \ - && /opt2/gatk-4.5.0.0/gatk --list -ENV PATH="/opt2/gatk-4.5.0.0:$PATH" +RUN wget https://github.com/broadinstitute/gatk/releases/download/4.6.0.0/gatk-4.6.0.0.zip \ + && unzip /opt2/gatk-4.6.0.0.zip \ + && rm /opt2/gatk-4.6.0.0.zip \ + && /opt2/gatk-4.6.0.0/gatk --list +ENV PATH="/opt2/gatk-4.6.0.0:$PATH" #Use DISCVRSeq For CombineVariants Replacement @@ -96,12 +96,15 @@ RUN Rscript -e 'BiocManager::install(c("rtracklayer"))' # Install Sequenza-Utils/3.0.0 and Sequenza # Requires R, Python, SAMtools, tabix (already satisfied) # https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#getting-started -##Install Old version of IOtools for parallel processing +# Install Old version of IOtools for parallel processing RUN pip3 install --upgrade pip \ && pip3 install sequenza-utils \ && Rscript -e 'remotes::install_github("ShixiangWang/copynumber"); remotes::install_github("cran/sequenza")' \ && Rscript -e 'remotes::install_version("iotools",version="0.3-2")' +##Install cyvcf2 for processing vcfs in python +RUN pip3 install cyvcf2 + # Install Control-FREEC/v11.6 and additional dependencies # Requires R, samtools, bedtools, sambamba (already satisfied) # http://boevalab.inf.ethz.ch/FREEC/tutorial.html#install @@ -114,10 +117,10 @@ ENV PATH="/opt2/FREEC-11.6/src:$PATH" WORKDIR /opt2 -# Install Somalier/v0.2.16 +# Install Somalier/v0.2.19 # download static binary RUN mkdir somalier \ - && wget -O somalier/somalier https://github.com/brentp/somalier/releases/download/v0.2.16/somalier \ + && wget -O somalier/somalier https://github.com/brentp/somalier/releases/download/v0.2.19/somalier \ && chmod a+rx /opt2/somalier/somalier ENV PATH="/opt2/somalier:$PATH" @@ -189,31 +192,26 @@ RUN wget -O svaba_1.2.0 https://github.com/walaj/svaba/releases/download/v1.2.0/ && mkdir svaba \ && mv svaba_1.2.0 svaba/svaba \ && chmod a+x svaba/svaba - ENV PATH="/opt2/svaba:$PATH" -# MUSE -RUN wget -O muse_2.0.4.tar.gz https://github.com/wwylab/MuSE/archive/refs/tags/v2.0.4.tar.gz \ - && tar -xzf muse_2.0.4.tar.gz \ - && cd MuSE-2.0.4 \ +# MUSE-Use Cloned Version due to -ldeflate option needed +RUN git clone https://github.com/dnousome/MuSE \ + && cd MuSE \ && ./install_muse.sh \ - && mv MuSE /opt2/ \ - && chmod a+x /opt2/MuSE \ - && rm -R /opt2/MuSE-2.0.4 \ - && rm /opt2/muse_2.0.4.tar.gz - + && chmod a+rX /opt2/MuSE/MuSE ENV PATH="/opt2/MuSE:$PATH" +WORKDIR /opt2 # HMFtools for PURPLE/COBALT/AMBER -RUN wget https://github.com/hartwigmedical/hmftools/releases/download/amber-v4.0/amber_v4.0.jar \ +RUN wget https://github.com/hartwigmedical/hmftools/releases/download/amber-v4.0.1/amber_v4.0.1.jar \ && wget https://github.com/hartwigmedical/hmftools/releases/download/cobalt-v1.16/cobalt_v1.16.jar \ && wget https://github.com/hartwigmedical/hmftools/releases/download/purple-v4.0.2/purple_v4.0.2.jar \ - && wget https://github.com/hartwigmedical/hmftools/releases/download/sage-v3.4.3/sage_v3.4.3.jar \ + && wget https://github.com/hartwigmedical/hmftools/releases/download/sage-v3.4.4/sage_v3.4.4.jar \ && mkdir hmftools \ - && mv amber_v4.0.jar hmftools/amber.jar \ + && mv amber_v4.0.1.jar hmftools/amber.jar \ && mv cobalt_v1.16.jar hmftools/cobalt.jar \ && mv purple_v4.0.2.jar hmftools/purple.jar \ - && mv sage_v3.4.3.jar hmftools/sage.jar \ + && mv sage_v3.4.4.jar hmftools/sage.jar \ && chmod a+x hmftools/amber.jar ENV PATH="/opt2/hmftools:$PATH" diff --git a/docker/logan_base/build.sh b/docker/logan_base/build.sh index 5515cc0..90b4446 100644 --- a/docker/logan_base/build.sh +++ b/docker/logan_base/build.sh @@ -5,13 +5,13 @@ #docker buildx inspect upbeat_ganguly #docker buildx build --platform linux/amd64 -f Dockerfile -t dnousome/ccbr_logan_base:v0.3.0 -t dnousome/ccbr_logan_base:latest --push . -docker build --platform linux/amd64 --tag ccbr_logan_base:v0.3.5 -f Dockerfile . +docker build --platform linux/amd64 --tag ccbr_logan_base:v0.3.6 -f Dockerfile . -docker tag ccbr_logan_base:v0.3.5 dnousome/ccbr_logan_base:v0.3.5 -docker tag ccbr_logan_base:v0.3.5 dnousome/ccbr_logan_base +docker tag ccbr_logan_base:v0.3.6 dnousome/ccbr_logan_base:v0.3.6 +docker tag ccbr_logan_base:v0.3.6 dnousome/ccbr_logan_base -docker push dnousome/ccbr_logan_base:v0.3.5 +docker push dnousome/ccbr_logan_base:v0.3.6 docker push dnousome/ccbr_logan_base:latest From b518795804b44f7ec4d2c0006c39a144ff118dd5 Mon Sep 17 00:00:00 2001 From: Darryl Nousome Date: Wed, 10 Jul 2024 10:12:39 -0400 Subject: [PATCH 16/16] fix: strelka add GT/AD columns --- conf/containers.config | 2 +- modules/local/variant_calling.nf | 48 +++++++++++++++++--------- modules/local/variant_calling_tonly.nf | 2 +- nextflow.config | 2 +- subworkflows/local/workflows.nf | 11 +++--- subworkflows/local/workflows_tonly.nf | 17 ++++----- 6 files changed, 48 insertions(+), 34 deletions(-) diff --git a/conf/containers.config b/conf/containers.config index 213986f..fa3cd71 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -2,7 +2,7 @@ params { containers { base = 'docker://nciccbr/ccbr_ubuntu_base_20.04:v6.1' - logan = 'docker://dnousome/ccbr_logan_base:v0.3.5' + logan = 'docker://dnousome/ccbr_logan_base:v0.3.6' vcf2maf = 'docker://dnousome/ccbr_vcf2maf:v102.0.0' lofreq = 'docker://dnousome/ccbr_lofreq:v0.0.1' octopus = 'docker://dancooke/octopus:latest' diff --git a/modules/local/variant_calling.nf b/modules/local/variant_calling.nf index eae43f2..e258a8d 100644 --- a/modules/local/variant_calling.nf +++ b/modules/local/variant_calling.nf @@ -333,7 +333,7 @@ process strelka_tn { mv wd/results/variants/somatic.snvs.vcf.gz ${tumorname}_vs_${normalname}_${bed.simpleName}.somatic_temp.snvs.vcf.gz mv wd/results/variants/somatic.indels.vcf.gz ${tumorname}_vs_${normalname}_${bed.simpleName}.somatic_temp.indels.vcf.gz - printf "NORMAL\t${normalname}\nTUMOR\t${tumorname}\n" >sampname + printf %s "NORMAL\t${normalname}\nTUMOR\t${tumorname}\n" >sampname bcftools reheader -s sampname ${tumorname}_vs_${normalname}_${bed.simpleName}.somatic_temp.snvs.vcf.gz \ | bcftools view -Oz -o ${tumorname}_vs_${normalname}_${bed.simpleName}.somatic.snvs.vcf.gz @@ -631,18 +631,27 @@ process combineVariants { script: vcfin = inputvcf.join(" -I ") + //Create Tumor Normal here + samplist=sample.tokenize('_vs_') + if(samplist.size>1){ + samporder = samplist.join(",") + }else{ + samporder = sample + } """ mkdir ${vc} gatk --java-options "-Xmx48g" SortVcf \ - -O ${sample}.${vc}.marked.vcf.gz \ + -O ${sample}.${vc}.markedtemp.vcf.gz \ -SD $GENOMEDICT \ -I $vcfin + + bcftools view ${sample}.${vc}.markedtemp.vcf.gz -s $samporder -Oz -o ${sample}.${vc}.marked.vcf.gz bcftools norm ${sample}.${vc}.marked.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M|B|D|H|V]\\y/,"N",\$4); OFS = "\t"; print}}' |\ sed '/^\$/d' > ${sample}.${vc}.temp.vcf - bcftools view ${sample}.${vc}.temp.vcf -f PASS -Oz -o ${vc}/${sample}.${vc}.norm.vcf.gz + bcftools view ${sample}.${vc}.temp.vcf -f PASS -s $samporder -Oz -o ${vc}/${sample}.${vc}.norm.vcf.gz mv ${sample}.${vc}.marked.vcf.gz ${vc} mv ${sample}.${vc}.marked.vcf.gz.tbi ${vc} @@ -681,14 +690,19 @@ process combineVariants_alternative { script: vcfin = vcfs.join(" ") - + samplist=sample.tokenize('_vs_') + if(samplist.size>1){ + samporder = samplist.join(",") + }else{ + samporder = sample + } if (vc.contains("octopus")) { """ mkdir ${vc} bcftools concat $vcfin -a -Oz -o ${sample}.${vc}.temp1.vcf.gz bcftools reheader -f $GENOMEFAI ${sample}.${vc}.temp1.vcf.gz -o ${sample}.${vc}.temp.vcf - bcftools sort ${sample}.${vc}.temp.vcf | bcftools view - -i "INFO/SOMATIC==1" -Oz -o ${sample}.${vc}.marked.vcf.gz + bcftools sort ${sample}.${vc}.temp.vcf | bcftools view - -i "INFO/SOMATIC==1" -s $samporder -Oz -o ${sample}.${vc}.marked.vcf.gz bcftools norm ${sample}.${vc}.marked.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M|B|D|H|V]\\y/,"N",\$4); OFS = "\t"; print}}' |\ sed '/^\$/d' > ${sample}.${vc}.temp.vcf @@ -705,7 +719,7 @@ process combineVariants_alternative { mkdir ${vc} bcftools concat $vcfin -a -Oz -o ${sample}.${vc}.temp1.vcf.gz bcftools reheader -f $GENOMEFAI ${sample}.${vc}.temp1.vcf.gz -o ${sample}.${vc}.temp.vcf - bcftools sort ${sample}.${vc}.temp.vcf -Oz -o ${sample}.${vc}.marked.vcf.gz + bcftools sort ${sample}.${vc}.temp.vcf | bcftools view - -s $samporder -Oz -o ${sample}.${vc}.marked.vcf.gz bcftools norm ${sample}.${vc}.marked.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M|B|D|H|V]\\y/,"N",\$4); OFS = "\t"; print}}' |\ sed '/^\$/d' > ${sample}.${vc}.temp.vcf @@ -717,12 +731,7 @@ process combineVariants_alternative { bcftools index ${vc}/${sample}.${vc}.norm.vcf.gz -t """ } - - - - - stub: """ @@ -784,15 +793,19 @@ process combineVariants_strelka { vcfin = strelkasnvs.join(" ") indelsin = strelkaindels.join(" ") - - + samplist=sample.tokenize('_vs_') + if(samplist.size>1){ + samporder = samplist.join(",") + }else{ + samporder = sample + } """ bcftools concat $vcfin $indelsin --threads $task.cpus -Oz -o ${sample}.temp.strelka.vcf.gz -a bcftools norm ${sample}.temp.strelka.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M|B|D|H|V]\\y/,"N",\$4); OFS = "\t"; print}}' |\ sed '/^\$/d' > ${sample}.temp1.strelka.vcf.gz - bcftools sort ${sample}.temp1.strelka.vcf.gz -Oz -o ${sample}.strelka.vcf.gz + bcftools sort ${sample}.temp1.strelka.vcf.gz |bcftools view - -s $samporder -Oz -o ${sample}.strelka.vcf.gz bcftools view ${sample}.strelka.vcf.gz --threads $task.cpus -f PASS -Oz -o ${sample}.filtered.strelka.vcf.gz @@ -812,8 +825,8 @@ process combineVariants_strelka { process convert_strelka { - //Add GT column to - conda 'bioconda::cyvcf2 bioconda::bcftools numpy' + //Add GT/AD column to Strelka Variants + container "${params.containers.logan}" label 'process_medium' input: @@ -912,6 +925,9 @@ process ffpe_1 { touch "${tumorsample}_vs_${normal}_combined_ffpolish.vcf.gz" """ } + + + /*DISCVR process somaticcombine { container "${params.containers.logan}" diff --git a/modules/local/variant_calling_tonly.nf b/modules/local/variant_calling_tonly.nf index f810a04..7c30eb8 100644 --- a/modules/local/variant_calling_tonly.nf +++ b/modules/local/variant_calling_tonly.nf @@ -264,7 +264,7 @@ process varscan_tonly { awk '{{gsub(/\\y[W|K|Y|R|S|M|B|D|H|V]\\y/,"N",\$4); OFS = "\t"; print}}' !{tumor.simpleName}_!{bed.simpleName}.tonly.varscan.vcf_temp \ | sed '/^$/d' | bcftools view - -Oz -o !{tumor.simpleName}_!{bed.simpleName}.tonly.varscan.vcf - printf "TUMOR\t!{tumorname}\n" > sampname + printf "Sample1\t!{tumorname}\n" > sampname bcftools reheader -s sampname !{tumor.simpleName}_!{bed.simpleName}.tonly.varscan.vcf \ | bcftools view -Oz -o !{tumor.simpleName}_!{bed.simpleName}.tonly.varscan.vcf.gz diff --git a/nextflow.config b/nextflow.config index 9f5c5c6..9a8269a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,7 +54,7 @@ params { bam_input=null BAMINPUT=null - callers = "mutect2,octopus,lofreq,muse,sage,vardict,varscan" + callers = "mutect2,octopus,strelka,lofreq,muse,sage,vardict,varscan" tonlycallers = "mutect2,octopus,vardict,varscan" cnvcallers = "purple,sequenza,freec" svcallers = "manta,svaba" diff --git a/subworkflows/local/workflows.nf b/subworkflows/local/workflows.nf index d8b98af..783d0fc 100644 --- a/subworkflows/local/workflows.nf +++ b/subworkflows/local/workflows.nf @@ -97,22 +97,20 @@ workflow ALIGN { take: fastqinput sample_sheet - main: - fastp(fastqinput) + main: if (params.intervals){ intervalbedin = Channel.fromPath(params.intervals) }else{ intervalbedin = Channel.fromPath(params.genomes[params.genome].intervals,checkIfExists: true,type: 'file') } - splitinterval(intervalbedin) + fastp(fastqinput) bwamem2(fastp.out) bqsrbambyinterval=bwamem2.out.combine(splitinterval.out.flatten()) bambyinterval=bwamem2.out.combine(splitinterval.out.flatten()) - bqsr(bqsrbambyinterval) bqsrs=bqsr.out.groupTuple() .map { samplename,beds -> tuple( samplename, @@ -123,7 +121,6 @@ workflow ALIGN { tobqsr=bwamem2.out.combine(gatherbqsr.out,by:0) applybqsr(tobqsr) - //sample_sheet.view() bamwithsample=applybqsr.out.combine(sample_sheet,by:0).map{it.swap(3,0)}.combine(applybqsr.out,by:0).map{it.swap(3,0)} emit: @@ -287,7 +284,7 @@ workflow VC { strelka_in=strelka_tn(bambyinterval) | groupTuple(by:[0,1]) | map { tumor,normal,vcfs,vcfindex,indels,indelindex -> tuple("${tumor}_vs_${normal}", vcfs.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.snvs.vcf.gz/)[0][1].toInteger() },vcfindex, - indels.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() } ,indelindex)} + indels.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() },indelindex)} | combineVariants_strelka | join(sample_sheet_paired) | map{sample,markedvcf,markedindex,finalvcf,finalindex,tumor,normal -> tuple(tumor,normal,"strelka",finalvcf,finalindex)} | convert_strelka @@ -308,7 +305,7 @@ workflow VC { vc_all=vc_all|concat(vardict_in) - //VarDict TOnly + //Vardict TOnly if (!params.no_tonly){ vardict_in_tonly=vardict_tonly(bambyinterval_t) | groupTuple() diff --git a/subworkflows/local/workflows_tonly.nf b/subworkflows/local/workflows_tonly.nf index d691039..02d8698 100644 --- a/subworkflows/local/workflows_tonly.nf +++ b/subworkflows/local/workflows_tonly.nf @@ -49,7 +49,7 @@ workflow INPUT_TONLY { if(params.fastq_input){ fastqinput=Channel.fromFilePairs(params.fastq_input) }else if(params.fastq_file_input){ - fastqinput=Channel.fromPath(params.fastq_file_input).view() + fastqinput=Channel.fromPath(params.fastq_file_input) .splitCsv(header: false, sep: "\t", strip:true) .map{ sample,fq1,fq2 -> tuple(sample, tuple(file(fq1),file(fq2))) @@ -80,14 +80,15 @@ workflow ALIGN_TONLY { sample_sheet main: - fastp(fastqinput) - if (params.intervals){ - intervalbedin = Channel.fromPath(params.intervals,checkIfExists: true,type: 'file') - }else{ - intervalbedin = Channel.fromPath(params.genomes[params.genome].intervals,checkIfExists: true,type: 'file') - } - splitinterval(intervalbedin) + + if (params.intervals){ + intervalbedin = Channel.fromPath(params.intervals,checkIfExists: true,type: 'file') + }else{ + intervalbedin = Channel.fromPath(params.genomes[params.genome].intervals,checkIfExists: true,type: 'file') + } + splitinterval(intervalbedin) + fastp(fastqinput) bwamem2(fastp.out) //indelrealign(bwamem2.out) Consider indelreaglinement using ABRA?