From e7ed76c8ae491ec8237d5fa0942b97c2246b3c88 Mon Sep 17 00:00:00 2001 From: "Brian J. Sanderson" Date: Tue, 19 Mar 2024 10:11:17 -0500 Subject: [PATCH 1/5] added utility module to merge rsem results into counts and tpm tables for all samples, adapted from nfcore rnaseq pipeline --- modules/utility_modules/merge_rsem_counts.nf | 51 ++++++++++++++++++++ workflows/rnaseq.nf | 3 ++ 2 files changed, 54 insertions(+) create mode 100644 modules/utility_modules/merge_rsem_counts.nf diff --git a/modules/utility_modules/merge_rsem_counts.nf b/modules/utility_modules/merge_rsem_counts.nf new file mode 100644 index 0000000..c6958af --- /dev/null +++ b/modules/utility_modules/merge_rsem_counts.nf @@ -0,0 +1,51 @@ +// adapted from https://github.com/nf-core/rnaseq/blob/3.14.0/modules/local/rsem_merge_counts/main.nf +process MERGE_RSEM_COUNTS { + + tag "" + + cpus 1 + memory 24.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container "quay.io/jaxcompsci/py3_perl_pylibs:v2" + + publishDir "${params.pubdir}", pattern: "rsem.merged.*.tsv", mode:'copy' + + input: + path("genes/*") + path("isoforms/*") + + output: + path "rsem.merged.gene_counts.tsv", emit: gene_counts + path "rsem.merged.gene_tpm.tsv", emit: gene_tpm + path "rsem.merged.transcript_counts.tsv", emit: transcript_counts + path "rsem.merged.transcript_tpm.tsv", emit: transcript_tpm + script: + """ + mkdir -p tmp/genes + cut -f 1,2 `ls ./genes/* | head -n 1` > gene_ids.txt + for fileid in `ls ./genes/*`; do + samplename=`basename \$fileid | sed s/\\.genes.results\$//g` + echo \$samplename > tmp/genes/\${samplename}.counts.txt + cut -f 5 \${fileid} | tail -n+2 >> tmp/genes/\${samplename}.counts.txt + echo \$samplename > tmp/genes/\${samplename}.tpm.txt + cut -f 6 \${fileid} | tail -n+2 >> tmp/genes/\${samplename}.tpm.txt + done + + mkdir -p tmp/isoforms + cut -f 1,2 `ls ./isoforms/* | head -n 1` > transcript_ids.txt + for fileid in `ls ./isoforms/*`; do + samplename=`basename \$fileid | sed s/\\.isoforms.results\$//g` + echo \$samplename > tmp/isoforms/\${samplename}.counts.txt + cut -f 5 \${fileid} | tail -n+2 >> tmp/isoforms/\${samplename}.counts.txt + echo \$samplename > tmp/isoforms/\${samplename}.tpm.txt + cut -f 6 \${fileid} | tail -n+2 >> tmp/isoforms/\${samplename}.tpm.txt + done + + paste gene_ids.txt tmp/genes/*.counts.txt > rsem.merged.gene_counts.tsv + paste gene_ids.txt tmp/genes/*.tpm.txt > rsem.merged.gene_tpm.tsv + paste transcript_ids.txt tmp/isoforms/*.counts.txt > rsem.merged.transcript_counts.tsv + paste transcript_ids.txt tmp/isoforms/*.tpm.txt > rsem.merged.transcript_tpm.tsv + """ +} diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 2e0c7be..5fce8c9 100644 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -17,6 +17,7 @@ include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" include {RSEM_ALIGNMENT_EXPRESSION} from "${projectDir}/modules/rsem/rsem_alignment_expression" +include {MERGE_RSEM_COUNTS} from "${projectDir}/modules/utility_modules/merge_rsem_counts" include {PICARD_ADDORREPLACEREADGROUPS} from 
"${projectDir}/modules/picard/picard_addorreplacereadgroups" include {PICARD_REORDERSAM} from "${projectDir}/modules/picard/picard_reordersam" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" @@ -137,6 +138,8 @@ workflow RNASEQ { // Step 2: RSEM RSEM_ALIGNMENT_EXPRESSION(rsem_input, params.rsem_ref_files, params.rsem_star_prefix, params.rsem_ref_prefix) + MERGE_RSEM_COUNTS(RSEM_ALIGNMENT_EXPRESSION.out.rsem_genes.collect{it[1]}, + RSEM_ALIGNMENT_EXPRESSION.out.rsem_isoforms.collect{it[1]}) //Step 3: Get Read Group Information READ_GROUPS(FASTP.out.trimmed_fastq, "picard") From 13d40514ec3905e3c03b31abd6dabdfd9259d444 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Mon, 1 Jul 2024 10:09:39 -0400 Subject: [PATCH 2/5] mem adjustment, py2 to py3 updates, multiqc version bump, typo fixes --- bin/help/ancestry.nf | 3 +-- bin/help/illumina.nf | 4 ---- bin/help/ont.nf | 1 - bin/help/pacbio.nf | 1 - bin/log/pta.nf | 4 +++- bin/log/rnaseq.nf | 4 +++- bin/shared/read_group_from_fastq.py | 17 +++++++++-------- modules/fastp/fastp.nf | 2 +- modules/multiqc/multiqc.nf | 2 +- modules/nygenome/lancet.nf | 2 +- modules/picard/picard_markduplicates.nf | 8 +++++--- modules/picard/picard_reordersam.nf | 2 +- modules/picard/picard_sortsam.nf | 2 +- modules/utility_modules/read_groups.nf | 2 +- workflows/somatic_wes_pta.nf | 2 -- 15 files changed, 27 insertions(+), 29 deletions(-) diff --git a/bin/help/ancestry.nf b/bin/help/ancestry.nf index cdab7d8..1351326 100644 --- a/bin/help/ancestry.nf +++ b/bin/help/ancestry.nf @@ -7,8 +7,7 @@ Parameter | Default | Description --cacheDir | /projects/omics_share/meta/containers | This is directory that contains cached Singularity containers. JAX users should not change this parameter. -w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /flashscratch or other directory with ample storage. ---csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified. ---download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs. +--csv_input | null | Provide a CSV manifest file with the header: "sampleID,bam". See the repository wiki for an example file. --genotype_targets | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file for the ancestry panel. Can contain annotation information. --snpID_list | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2.list' | Target SNPs in list used in BCFtools filtering step --snp_annotations | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file with annotations for the ancestry panel. diff --git a/bin/help/illumina.nf b/bin/help/illumina.nf index 5f2dcc3..fa9cb26 100644 --- a/bin/help/illumina.nf +++ b/bin/help/illumina.nf @@ -20,7 +20,6 @@ def help(){ Parameter | Default | Description --genome_build | "GRCm38" | Parameter that controls reference data used for alignment and annotation. GRCm38 is the default value, GRCm39 is an accepted alternate value. 
--ref_fa | / | Path to the reference genome in FASTA format. - --fasta_index | / | Optional paramter to specify index for reference genome. If not provided, pipeline will generate an index. --exclude_regions | /ref_data/ucsc_mm10_gap_chr_sorted.bed | BED file that lists the coordinates of centromeres and telomeres to exclude as alignment targets --sv_ins_ref | /ref_data/variants_freeze5_sv_INS_mm39_to_mm10_sorted.bed.gz | BED file that lists previously indentified insertion SVs --sv_del_ref | /ref_data/variants_freeze5_sv_DEL_mm39_to_mm10_sorted.bed.gz | BED file that lists previously indentified deletion SVs @@ -29,12 +28,10 @@ def help(){ --genes_bed | /ref_data/Mus_musculus.GRCm38.102.gene_symbol.bed | BED file that lists gene symbol IDs and coordinates --exons_bed | /ref_data/Mus_musculus.GRCm38.102.exons.bed | BED file that lists exons and coordinates - vep parameters: --vep_cache_directory | '/projects/compsci/omics_share/mouse/GRCm39/genome/annotation/vep_data' | VEP annotation cache. Cache provided is for Ensembl v109. --vep_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/ensembl/GRCm39.p0/Mus_musculus.GRCm39.dna.primary_assembly.fa' | VEP requires an ensembl based fasta. GRCh38.p13 is used for v97-v109. - fastp filtering paramenters: Parameter | Default | Description --quality_phred | 30 | Quality score threshold. @@ -47,6 +44,5 @@ def help(){ --surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging. --surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging. --surv_min | 30 | Minimum length (bp) to output SVs. - ''' } diff --git a/bin/help/ont.nf b/bin/help/ont.nf index 2c346ed..6546bcc 100644 --- a/bin/help/ont.nf +++ b/bin/help/ont.nf @@ -46,6 +46,5 @@ def help(){ --surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging. --surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging. --surv_min | 30 | Minimum length (bp) to output SVs. - ''' } diff --git a/bin/help/pacbio.nf b/bin/help/pacbio.nf index 74eb5a8..5bcadb9 100644 --- a/bin/help/pacbio.nf +++ b/bin/help/pacbio.nf @@ -35,6 +35,5 @@ def help(){ --surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging. --surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging. --surv_min | 30 | Minimum length (bp) to output SVs. 
- ''' } diff --git a/bin/log/pta.nf b/bin/log/pta.nf index c3a0219..00795f9 100644 --- a/bin/log/pta.nf +++ b/bin/log/pta.nf @@ -114,7 +114,9 @@ ______________________________________________________ --quality_phred ${params.quality_phred} --unqualified_perc ${params.unqualified_perc} --detect_adapter_for_pe ${params.detect_adapter_for_pe} ---xenome_prefix ${params.xenome_prefix} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --ref_fa_dict ${params.ref_fa_dict} diff --git a/bin/log/rnaseq.nf b/bin/log/rnaseq.nf index c788944..24ad00b 100644 --- a/bin/log/rnaseq.nf +++ b/bin/log/rnaseq.nf @@ -109,7 +109,9 @@ ______________________________________________________ --seed_length ${params.seed_length} --pdx ${params.pdx} ---xenome_prefix ${params.xenome_prefix} +--xengsort_host_fasta ${params.xengsort_host_fasta} +--xengsort_idx_path ${params.xengsort_idx_path} +--xengsort_idx_name ${params.xengsort_idx_name} --strandedness_ref ${params.strandedness_ref} --strandedness_gtf ${params.strandedness_gtf} diff --git a/bin/shared/read_group_from_fastq.py b/bin/shared/read_group_from_fastq.py index 5d19862..4c92a80 100644 --- a/bin/shared/read_group_from_fastq.py +++ b/bin/shared/read_group_from_fastq.py @@ -32,7 +32,7 @@ def parse_args(): - parser = argparse.ArgumentParser(version='V2.0') + parser = argparse.ArgumentParser() parser.add_argument('-p', '--picard', action='store_true', help="Use Picard format for read group line") parser.add_argument('-t', '--tumor', action='store_true', @@ -64,11 +64,14 @@ def parse_args(): def multi_open(name): if name.endswith('.gz'): f = gzip.open(name) + line = f.readline().decode() elif name.endswith('.bz2'): f = bz2.BZ2File(name) + line = f.readline().decode() else: f = open(name) - return f + line = f.readline() + return line def make_fake(args): @@ -153,11 +156,9 @@ def main(): # - the ID in the first four fields. # Note: the leading '@' needs to be stripped. try: - inf = multi_open(args.fastq[0]) - line = inf.readline() - except IOError, e: - print >> sys.stderr, "Couldn't read the file: {0}\n {1}". \ - format(fn, e.message) + line = multi_open(args.fastq[0]) + except IOError as e: + sys.stderr.write("Couldn't read the file: {0}\n {1}".format(fn, e)) make_fake(args) # Does not return @@ -200,7 +201,7 @@ def output(id, ges_id, cust_id, bar_code, args): line = '@RG\\tID:{0}{1}\\tLB:{0}{2}\\tSM:{3}\\tPL:ILLUMINA'.\ format(args.sample_type, id, ges_id, cust_id) # This needs to be a single line file; no terminating \n - print >> of, line, + print(line, end='', file=of) if of != sys.stdout: of.close() diff --git a/modules/fastp/fastp.nf b/modules/fastp/fastp.nf index 8c02cd7..19d2bdf 100644 --- a/modules/fastp/fastp.nf +++ b/modules/fastp/fastp.nf @@ -9,7 +9,7 @@ process FASTP { container 'quay.io/biocontainers/fastp:0.23.2--h5f740d0_3' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'stats'}", pattern: "${sampleID}_fastp_report.*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'stats'}", pattern: "${sampleID}_fastp*.*", mode:'copy' input: tuple val(sampleID), path(fq_reads) diff --git a/modules/multiqc/multiqc.nf b/modules/multiqc/multiqc.nf index e1560f4..f9d32ab 100644 --- a/modules/multiqc/multiqc.nf +++ b/modules/multiqc/multiqc.nf @@ -5,7 +5,7 @@ process MULTIQC { errorStrategy {(task.exitStatus == 140) ?
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/jaxcompsci/multiqc:v1.21_custom' + container 'quay.io/jaxcompsci/multiqc:v1.22.3_custom' publishDir "${params.pubdir}/multiqc", pattern: "*multiqc_report.html", mode:'copy' publishDir "${params.pubdir}/multiqc", pattern: "*_data", mode:'copy' diff --git a/modules/nygenome/lancet.nf b/modules/nygenome/lancet.nf index a3ba615..7319143 100644 --- a/modules/nygenome/lancet.nf +++ b/modules/nygenome/lancet.nf @@ -3,7 +3,7 @@ process LANCET { cpus = 4 memory = 30.GB - time = '10:00:00' + time = '18:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/lancet:v1.1.0' diff --git a/modules/picard/picard_markduplicates.nf b/modules/picard/picard_markduplicates.nf index 8cd8dd1..030bb64 100644 --- a/modules/picard/picard_markduplicates.nf +++ b/modules/picard/picard_markduplicates.nf @@ -2,7 +2,7 @@ process PICARD_MARKDUPLICATES { tag "$sampleID" cpus 1 - memory { bam.size() < 60.GB ? 20.GB : 40.GB } + memory 130.GB time { bam.size() < 60.GB ? '18:00:00' : '48:00:00' } errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} @@ -29,17 +29,19 @@ process PICARD_MARKDUPLICATES { tuple val(sampleID), file("*.txt"), emit: dedup_metrics script: - String my_mem = (task.memory-1.GB).toString() + String my_mem = (task.memory-10.GB).toString() my_mem = my_mem[0..-4] """ - picard -Xmx${my_mem}G MarkDuplicates \ + picard -Xmx${my_mem}G -Xms${my_mem}G MarkDuplicates \ I=${bam[0]} \ O=${sampleID}_dedup.bam \ M=${sampleID}_dup_metrics.txt \ REMOVE_DUPLICATES=false \ CREATE_INDEX=true \ TMP_DIR=${workDir}/temp \ + SORTING_COLLECTION_SIZE_RATIO=0.15 \ + MAX_RECORDS_IN_RAM=300000 \ VALIDATION_STRINGENCY=SILENT """ } diff --git a/modules/picard/picard_reordersam.nf b/modules/picard/picard_reordersam.nf index 9c4f62b..614410b 100644 --- a/modules/picard/picard_reordersam.nf +++ b/modules/picard/picard_reordersam.nf @@ -2,7 +2,7 @@ process PICARD_REORDERSAM { tag "$sampleID" cpus 1 - memory 50.GB + memory 70.GB time '06:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} diff --git a/modules/picard/picard_sortsam.nf b/modules/picard/picard_sortsam.nf index cb60520..31dbaa8 100644 --- a/modules/picard/picard_sortsam.nf +++ b/modules/picard/picard_sortsam.nf @@ -2,7 +2,7 @@ process PICARD_SORTSAM { tag "$sampleID" cpus 1 - memory { sam.size() < 60.GB ? 10.GB : 30.GB } + memory { sam.size() < 60.GB ? 30.GB : 60.GB } time { sam.size() < 60.GB ? 
'10:00:00' : '24:00:00' } errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} diff --git a/modules/utility_modules/read_groups.nf b/modules/utility_modules/read_groups.nf index 9fc95ec..56db735 100644 --- a/modules/utility_modules/read_groups.nf +++ b/modules/utility_modules/read_groups.nf @@ -6,7 +6,7 @@ process READ_GROUPS { time '01:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' + container 'quay.io/jaxcompsci/py3_bgzip:3.10.12' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.workflow == 'rnaseq' || params.keep_intermediate diff --git a/workflows/somatic_wes_pta.nf b/workflows/somatic_wes_pta.nf index e81c6dc..bacb560 100755 --- a/workflows/somatic_wes_pta.nf +++ b/workflows/somatic_wes_pta.nf @@ -105,8 +105,6 @@ workflow SOMATIC_WES_PTA { // Step 1: Read Trim FASTP(read_ch) - - xenome_input = FASTP.out.trimmed_fastq FASTQC(FASTP.out.trimmed_fastq) From d461492dad1dc800b9adfc3da03cd28a22f809a1 Mon Sep 17 00:00:00 2001 From: Alan Hoyle Date: Mon, 1 Jul 2024 20:19:14 -0400 Subject: [PATCH 3/5] bowtie now catches pipe failures. If a corrupted FASTQ is input, the process creates corrupted output but does not record the error and cause Nextflow to realize the process has failed. This is because the error comes from the initial zcat and since it goes through a set of unix pipes, the exit code is not caught. This also changes the formatting of the .command.sh file so that each separate process in the pipe is on a separate line. --- modules/bowtie/bowtie.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/bowtie/bowtie.nf b/modules/bowtie/bowtie.nf index 4011266..f0fe8db 100644 --- a/modules/bowtie/bowtie.nf +++ b/modules/bowtie/bowtie.nf @@ -19,7 +19,11 @@ process BOWTIE { script: """ - zcat ${fq_read} | bowtie -p ${task.cpus} -q -a --best --strata --sam -v 3 -x ${params.bowtie_index} - 2> ${sampleID}.bowtie_${paired_read_num}.log | samtools view -bS - > ${sampleID}_mapped_${paired_read_num}.bam + set -o pipefail + + zcat ${fq_read} \\ + | bowtie -p ${task.cpus} -q -a --best --strata --sam -v 3 -x ${params.bowtie_index} - 2> ${sampleID}.bowtie_${paired_read_num}.log \\ + | samtools view -bS - > ${sampleID}_mapped_${paired_read_num}.bam """ // NOTE: This is hard coded to .gz input files. @@ -53,4 +57,4 @@ Report alignments with at most mismatches. -e and -l options are ignored a -m Suppress all alignments for a particular read or pair if more than reportable alignments exist for it. Reportable alignments are those that would be reported given the -n, -v, -l, -e, -k, -a, --best, and --strata options. Default: no limit. Bowtie is designed to be very fast for small -m but bowtie can become significantly slower for larger values of -m. 
If you would like to use Bowtie for larger values of -k, consider building an index with a denser suffix-array sample, i.e. specify a smaller -o/--offrate when invoking bowtie-build for the relevant index (see the Performance tuning section for details). -*/ \ No newline at end of file +*/ From 3191b56f7e1fb798b4511a83c81ac315a9e05b27 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Wed, 3 Jul 2024 14:50:30 -0400 Subject: [PATCH 4/5] polish --- bin/help/rnaseq.nf | 3 ++- bin/log/rnaseq.nf | 6 ++++++ config/rnaseq.config | 2 +- modules/multiqc/multiqc.nf | 2 +- modules/utility_modules/merge_rsem_counts.nf | 22 +++++++++++--------- nextflow.config | 2 +- subworkflows/pdx_rnaseq.nf | 14 +++++++++++++ workflows/rnaseq.nf | 9 ++++++-- 8 files changed, 44 insertions(+), 16 deletions(-) diff --git a/bin/help/rnaseq.nf b/bin/help/rnaseq.nf index 5229183..cd3c87f 100644 --- a/bin/help/rnaseq.nf +++ b/bin/help/rnaseq.nf @@ -35,6 +35,8 @@ Parameter | Default | Description --seed_length | 25 | 'Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter.' --rsem_aligner | 'bowtie2' | Options: bowtie2 or star. The aligner algorithm used by RSEM. Note, if using STAR, point rsem_ref_files to STAR based indices. +--merge_rna_counts | false | Options false, true. If specified, gene and transcript counts are merged across all samples. Typically used in multi-sample cases. + --picard_dict | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.dict' | Human: '/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.dict' | The coverage metric calculation step requires this file. Refers to human assembly when --gen_org human. JAX users should not change this parameter. @@ -54,7 +56,6 @@ Parameter | Default | Description --xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and host_fa. --xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index - There are two additional parameters that are human specific. 
They are: Parameter| Default| Description diff --git a/bin/log/rnaseq.nf b/bin/log/rnaseq.nf index 24ad00b..35c6cd4 100644 --- a/bin/log/rnaseq.nf +++ b/bin/log/rnaseq.nf @@ -57,6 +57,7 @@ ______________________________________________________ --stradedness ${params.strandedness} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} Human specific files: --rsem_ref_prefix_human ${params.rsem_ref_prefix_human} @@ -118,6 +119,7 @@ ______________________________________________________ --stradedness ${params.strandedness} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} Human specific files: --rsem_ref_prefix_human ${params.rsem_ref_prefix_human} @@ -176,6 +178,7 @@ ______________________________________________________ --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} --picard_dict ${params.picard_dict} --ref_flat ${params.ref_flat} --ribo_intervals ${params.ribo_intervals} @@ -220,6 +223,7 @@ ______________________________________________________ --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} --rsem_star_prefix ${params.rsem_star_prefix} --picard_dict ${params.picard_dict} --ref_flat ${params.ref_flat} @@ -265,6 +269,7 @@ ______________________________________________________ --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} --picard_dict ${params.picard_dict} Project Directory: ${projectDir} @@ -307,6 +312,7 @@ ______________________________________________________ --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} --rsem_aligner ${params.rsem_aligner} +--merge_rna_counts ${params.merge_rna_counts} --rsem_star_prefix ${params.rsem_star_prefix} --picard_dict ${params.picard_dict} diff --git a/config/rnaseq.config b/config/rnaseq.config index 468014d..3a1989a 100644 --- a/config/rnaseq.config +++ b/config/rnaseq.config @@ -19,7 +19,7 @@ params { concat_lanes = false download_data = false csv_input = null - + merge_rna_counts = false pdx = false // if PDX, gen_org == human and xengsort is run to remove mouse reads from the sample(s). diff --git a/modules/multiqc/multiqc.nf b/modules/multiqc/multiqc.nf index f9d32ab..e1560f4 100644 --- a/modules/multiqc/multiqc.nf +++ b/modules/multiqc/multiqc.nf @@ -5,7 +5,7 @@ process MULTIQC { errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/jaxcompsci/multiqc:v1.22.3_custom' + container 'quay.io/jaxcompsci/multiqc:v1.21_custom' publishDir "${params.pubdir}/multiqc", pattern: "*multiqc_report.html", mode:'copy' publishDir "${params.pubdir}/multiqc", pattern: "*_data", mode:'copy' diff --git a/modules/utility_modules/merge_rsem_counts.nf b/modules/utility_modules/merge_rsem_counts.nf index c6958af..8b8bab6 100644 --- a/modules/utility_modules/merge_rsem_counts.nf +++ b/modules/utility_modules/merge_rsem_counts.nf @@ -1,7 +1,7 @@ // adapted from https://github.com/nf-core/rnaseq/blob/3.14.0/modules/local/rsem_merge_counts/main.nf process MERGE_RSEM_COUNTS { - tag "" + tag "Merge RSEM counts" cpus 1 memory 24.GB @@ -10,17 +10,19 @@ process MERGE_RSEM_COUNTS { container "quay.io/jaxcompsci/py3_perl_pylibs:v2" - publishDir "${params.pubdir}", pattern: "rsem.merged.*.tsv", mode:'copy' + publishDir "${params.pubdir}", pattern: "*rsem.merged.*.tsv", mode:'copy' input: path("genes/*") path("isoforms/*") + val(prefix) output: - path "rsem.merged.gene_counts.tsv", emit: gene_counts - path "rsem.merged.gene_tpm.tsv", emit: gene_tpm - path "rsem.merged.transcript_counts.tsv", emit: transcript_counts - path "rsem.merged.transcript_tpm.tsv", emit: transcript_tpm + path "*rsem.merged.gene_counts.tsv", emit: gene_counts + path "*rsem.merged.gene_tpm.tsv", emit: gene_tpm + path "*rsem.merged.transcript_counts.tsv", emit: transcript_counts + path "*rsem.merged.transcript_tpm.tsv", emit: transcript_tpm + script: """ mkdir -p tmp/genes @@ -43,9 +45,9 @@ process MERGE_RSEM_COUNTS { cut -f 6 \${fileid} | tail -n+2 >> tmp/isoforms/\${samplename}.tpm.txt done - paste gene_ids.txt tmp/genes/*.counts.txt > rsem.merged.gene_counts.tsv - paste gene_ids.txt tmp/genes/*.tpm.txt > rsem.merged.gene_tpm.tsv - paste transcript_ids.txt tmp/isoforms/*.counts.txt > rsem.merged.transcript_counts.tsv - paste transcript_ids.txt tmp/isoforms/*.tpm.txt > rsem.merged.transcript_tpm.tsv + paste gene_ids.txt tmp/genes/*.counts.txt > ${prefix}.rsem.merged.gene_counts.tsv + paste gene_ids.txt tmp/genes/*.tpm.txt > ${prefix}.rsem.merged.gene_tpm.tsv + paste transcript_ids.txt tmp/isoforms/*.counts.txt > ${prefix}.rsem.merged.transcript_counts.tsv + paste transcript_ids.txt tmp/isoforms/*.tpm.txt > ${prefix}.rsem.merged.transcript_tpm.tsv """ } diff --git a/nextflow.config b/nextflow.config index 16a7401..65066d4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,7 +45,7 @@ manifest { homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" nextflowVersion = "!>=22.04.3" - version = "0.6.2" + version = "0.6.4" author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Anuj Srivastava. 
Copyright Jackson Laboratory 2024' } diff --git a/subworkflows/pdx_rnaseq.nf b/subworkflows/pdx_rnaseq.nf index 10392a1..a4de6bf 100644 --- a/subworkflows/pdx_rnaseq.nf +++ b/subworkflows/pdx_rnaseq.nf @@ -14,6 +14,8 @@ include {XENGSORT_CLASSIFY} from "${projectDir}/modules/xengsort/xengsort_classi // GZIP as GZIP_MOUSE} from "${projectDir}/modules/utility_modules/gzip" include {RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_HUMAN; RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_MOUSE} from "${projectDir}/modules/rsem/rsem_alignment_expression" +include {MERGE_RSEM_COUNTS as MERGE_RSEM_COUNTS_HUMAN; + MERGE_RSEM_COUNTS as MERGE_RSEM_COUNTS_MOUSE} from "${projectDir}/modules/utility_modules/merge_rsem_counts" include {LYMPHOMA_CLASSIFIER} from "${projectDir}/modules/python/python_lymphoma_classifier" include {PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_HUMAN; PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_MOUSE} from "${projectDir}/modules/picard/picard_addorreplacereadgroups" @@ -72,6 +74,12 @@ workflow PDX_RNASEQ { RSEM_ALIGNMENT_EXPRESSION_HUMAN(human_reads, params.rsem_ref_files_human, params.rsem_star_prefix_human, params.rsem_ref_prefix_human) + if (params.merge_rna_counts) { + MERGE_RSEM_COUNTS_HUMAN(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_genes.collect{it[1]}, + RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_isoforms.collect{it[1]}, + 'humanSamples') + } + LYMPHOMA_CLASSIFIER(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_genes) // Picard Alignment Metrics @@ -94,6 +102,12 @@ workflow PDX_RNASEQ { RSEM_ALIGNMENT_EXPRESSION_MOUSE(mouse_reads, params.rsem_ref_files_mouse, params.rsem_star_prefix_mouse, params.rsem_ref_prefix_mouse) + if (params.merge_rna_counts) { + MERGE_RSEM_COUNTS_MOUSE(RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.rsem_genes.collect{it[1]}, + RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.rsem_isoforms.collect{it[1]}, + 'mouseSamples') + } + // Step 4: Picard Alignment Metrics READ_GROUPS_MOUSE(mouse_reads.map{it -> tuple(it[0], it[1])}, "picard") diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 6d4d287..a21862b 100644 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -138,8 +138,12 @@ workflow RNASEQ { // Step 2: RSEM RSEM_ALIGNMENT_EXPRESSION(rsem_input, params.rsem_ref_files, params.rsem_star_prefix, params.rsem_ref_prefix) - MERGE_RSEM_COUNTS(RSEM_ALIGNMENT_EXPRESSION.out.rsem_genes.collect{it[1]}, - RSEM_ALIGNMENT_EXPRESSION.out.rsem_isoforms.collect{it[1]}) + if (params.merge_rna_counts) { + MERGE_RSEM_COUNTS(RSEM_ALIGNMENT_EXPRESSION.out.rsem_genes.collect{it[1]}, + RSEM_ALIGNMENT_EXPRESSION.out.rsem_isoforms.collect{it[1]}, + 'allSamples') + } + //Step 3: Get Read Group Information READ_GROUPS(FASTP.out.trimmed_fastq, "picard") @@ -168,3 +172,4 @@ workflow RNASEQ { ) } } + From f92f969492ebd779189296620ffaf1ef95047159 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Wed, 3 Jul 2024 14:52:06 -0400 Subject: [PATCH 5/5] update release notes --- ReleaseNotes.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index bf97c6f..1459849 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,5 +1,40 @@ # RELEASE NOTES +## Release 0.6.4 + +In this release, we adjust memory and wallclock requirements for a number of modules, update `read_group_from_fastq.py` from python2 to python3, and incorporate PRs #4 and #5. 
+ +* PR #4 (contributed by @BrianSanderson) adds an optional gene and transcript count merge across samples in the RNA and PDX RNA workflows (enabled by including the `--merge_rna_counts` flag). +* PR #5 (contributed by @alanhoyle) adds a catch for corrupt gzip files in the Bowtie module as used by EMASE/GBRS analyses. + +### Pipelines Added: + +None + +### Modules Added: + +1. utility_modules/merge_rsem_counts.nf + +### Pipeline Changes: + +1. workflows/rnaseq.nf module added to merge gene and transcript expression when `--merge_rna_counts` is used. +1. workflows/pdx_rnaseq.nf module added to merge gene and transcript expression when `--merge_rna_counts` is used. + +### Module Changes: + +1. bowtie/bowtie.nf pipefail catch added for corrupt gzip files, per #5. +1. fastp/fastp.nf saves the JSON report as well as the HTML report. +1. nygenome/lancet.nf wallclock request increase. +1. picard/picard_markduplicates.nf memory adjustment, accounting for MarkDuplicates not fully respecting the -Xmx memory limit imposed by Java. +1. picard/picard_reordersam.nf memory request increase. +1. picard/picard_sortsam.nf memory request increase. +1. utility_modules/read_groups.nf container changed to py3. + +### Script Changes: + +1. bin/shared/read_group_from_fastq.py updated from py2 to py3. + + ## Release 0.6.3 In this release we change the read disambiguation tool Xenome for Xengsort. Extensive benchmarking shows high concordance among results obtained from both tools.
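For context on the PR #5 item above: a minimal shell sketch of the failure mode it addresses, assuming a corrupted gzip input (the file name below is a hypothetical placeholder, not a file from the pipeline). Without `set -o pipefail`, a shell pipeline's exit status is that of its last command, so a failure in the leading `zcat` is masked and the task appears to succeed.

    # Without pipefail, the pipeline's exit status comes from `wc -l`, which
    # succeeds even though zcat errors out partway through the corrupt file.
    zcat truncated.fastq.gz | wc -l
    echo "exit status without pipefail: $?"   # prints 0

    # With pipefail, the rightmost non-zero status (zcat's) is propagated,
    # so the script exits non-zero and the failure becomes visible to Nextflow.
    set -o pipefail
    zcat truncated.fastq.gz | wc -l
    echo "exit status with pipefail: $?"      # prints a non-zero value

The same principle applies to the `zcat | bowtie | samtools view` chain in modules/bowtie/bowtie.nf, which is why the patched script now begins with `set -o pipefail`.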