Merged in mem_fix (pull request #193)

Mem fix

MikeWLloyd committed Jul 3, 2024
2 parents 382b138 + f92f969 commit a6bb26b
Showing 22 changed files with 152 additions and 33 deletions.
35 changes: 35 additions & 0 deletions ReleaseNotes.md
@@ -1,5 +1,40 @@
# RELEASE NOTES

## Release 0.6.4

In this release, we adjust memory and wallclock requirements for a number of modules, update `read_group_from_fastq.py` from Python 2 to Python 3, and incorporate PRs #4 and #5.

* PR #4 (contributed by @BrianSanderson) adds an optional gene and transcript count merge across samples in the RNA and PDX RNA workflows, enabled with the `--merge_rna_counts` flag (see the sketch below).
* PR #5 (contributed by @alanhoyle) adds a catch for corrupt gzip files in the Bowtie module as used by EMASE/GBRS analyses.
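
A hypothetical invocation with the merge enabled might look like the following; the profile name and paths are placeholders, not prescriptive values:

```bash
# Sketch: run the RNA workflow with cross-sample count merging enabled.
# Profile, CSV manifest, and output directory below are illustrative.
nextflow run main.nf \
    -profile sumner \
    --workflow rnaseq \
    --gen_org mouse \
    --csv_input /path/to/samples.csv \
    --pubdir /path/to/results \
    --merge_rna_counts
```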

### Pipelines Added:

None

### Modules Added:

1. utility_modules/merge_rsem_counts.nf

### Pipeline Changes:

1. workflows/rnaseq.nf: module call added to merge gene and transcript expression when `--merge_rna_counts` is used.
1. workflows/pdx_rnaseq.nf: module call added to merge gene and transcript expression when `--merge_rna_counts` is used.

### Module Changes:

1. bowtie/bowtie.nf: pipefail catch added for corrupt gzip files, per PR #5.
1. fastp/fastp.nf: save the JSON report as well as the HTML report.
1. nygenome/lancet.nf: wallclock request increased.
1. picard/picard_markduplicates.nf: memory adjusted, accounting for MarkDuplicates not fully respecting the `-Xmx` heap limit imposed on Java (see the sketch after this list).
1. picard/picard_reordersam.nf: memory request increased.
1. picard/picard_sortsam.nf: memory request increased.
1. utility_modules/read_groups.nf: container changed to Python 3.
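
The MarkDuplicates adjustment reserves headroom between the scheduler request and the Java heap, since duplicate marking also allocates memory outside the heap (native sorting buffers, I/O) and a task can be killed by the scheduler even when `-Xmx` is honored. A minimal sketch of the resulting command, assuming the 130 GB request and 10 GB headroom used in the module diff below:

```bash
# Sketch: request 130 GB from the scheduler, but cap the Java heap 10 GB
# lower so off-heap allocations do not push the task over its limit.
TASK_MEM_GB=130                        # scheduler memory request
HEADROOM_GB=10                         # reserved for non-heap memory
HEAP_GB=$(( TASK_MEM_GB - HEADROOM_GB ))

picard -Xmx${HEAP_GB}G -Xms${HEAP_GB}G MarkDuplicates \
    I=input.bam \
    O=output_dedup.bam \
    M=dup_metrics.txt \
    MAX_RECORDS_IN_RAM=300000          # bounds in-heap record buffering
```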

### Script Changes:

1. bin/shared/read_group_from_fastq.py: updated from Python 2 to Python 3.


## Release 0.6.3

In this release, we replace the read disambiguation tool Xenome with Xengsort. Extensive benchmarking shows high concordance between the results of the two tools.
3 changes: 1 addition & 2 deletions bin/help/ancestry.nf
@@ -7,8 +7,7 @@ Parameter | Default | Description
--cacheDir | /projects/omics_share/meta/containers | This is the directory that contains cached Singularity containers. JAX users should not change this parameter.
-w | /<PATH> | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /flashscratch or other directory with ample storage.
--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
--csv_input | null | Provide a CSV manifest file with the header: "sampleID,bam". See the repository wiki for an example file.
--genotype_targets | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file for the ancestry panel. Can contain annotation information.
--snpID_list | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2.list' | Target SNPs in list used in BCFtools filtering step
--snp_annotations | '/projects/compsci/omics_share/human/GRCh38/supporting_files/ancestry_panel/snp_panel_v2_targets_annotations.snpwt.bed.gz' | Target SNP bed file with annotations for the ancestry panel.
4 changes: 0 additions & 4 deletions bin/help/illumina.nf
@@ -20,7 +20,6 @@ def help(){
Parameter | Default | Description
--genome_build | "GRCm38" | Parameter that controls reference data used for alignment and annotation. GRCm38 is the default value, GRCm39 is an accepted alternate value.
--ref_fa | /<PATH> | Path to the reference genome in FASTA format.
--fasta_index | /<PATH> | Optional parameter to specify an index for the reference genome. If not provided, the pipeline will generate an index.
--exclude_regions | /ref_data/ucsc_mm10_gap_chr_sorted.bed | BED file that lists the coordinates of centromeres and telomeres to exclude as alignment targets
--sv_ins_ref | /ref_data/variants_freeze5_sv_INS_mm39_to_mm10_sorted.bed.gz | BED file that lists previously identified insertion SVs
--sv_del_ref | /ref_data/variants_freeze5_sv_DEL_mm39_to_mm10_sorted.bed.gz | BED file that lists previously identified deletion SVs
@@ -29,12 +28,10 @@ def help(){
--genes_bed | /ref_data/Mus_musculus.GRCm38.102.gene_symbol.bed | BED file that lists gene symbol IDs and coordinates
--exons_bed | /ref_data/Mus_musculus.GRCm38.102.exons.bed | BED file that lists exons and coordinates
vep parameters:
--vep_cache_directory | '/projects/compsci/omics_share/mouse/GRCm39/genome/annotation/vep_data' | VEP annotation cache. Cache provided is for Ensembl v109.
--vep_fasta | '/projects/compsci/omics_share/mouse/GRCm39/genome/sequence/ensembl/GRCm39.p0/Mus_musculus.GRCm39.dna.primary_assembly.fa' | VEP requires an ensembl based fasta. GRCh38.p13 is used for v97-v109.
fastp filtering parameters:
Parameter | Default | Description
--quality_phred | 30 | Quality score threshold.
@@ -47,6 +44,5 @@ def help(){
--surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging.
--surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging.
--surv_min | 30 | Minimum length (bp) to output SVs.
'''
}
1 change: 0 additions & 1 deletion bin/help/ont.nf
@@ -46,6 +46,5 @@ def help(){
--surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging.
--surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging.
--surv_min | 30 | Minimum length (bp) to output SVs.
'''
}
1 change: 0 additions & 1 deletion bin/help/pacbio.nf
@@ -35,6 +35,5 @@ def help(){
--surv_type | 1 | Boolean (0/1) that requires SVs to be the same type for merging.
--surv_strand | 1 | Boolean (0/1) that requires SVs to be on the same strand for merging.
--surv_min | 30 | Minimum length (bp) to output SVs.
'''
}
3 changes: 2 additions & 1 deletion bin/help/rnaseq.nf
@@ -35,6 +35,8 @@ Parameter | Default | Description
--seed_length | 25 | 'Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter.'
--rsem_aligner | 'bowtie2' | Options: bowtie2 or star. The aligner algorithm used by RSEM. Note, if using STAR, point rsem_ref_files to STAR based indices.
--merge_rna_counts | false | Options false, true. If specified, gene and transcript counts are merged across all samples. Typically used in multi-sample cases.
--picard_dict | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.dict'
| Human: '/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.dict'
| The coverage metric calculation step requires this file. Refers to human assembly when --gen_org human. JAX users should not change this parameter.
@@ -54,7 +56,6 @@ Parameter | Default | Description
--xengsort_idx_path | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xengsort' | Xengsort index for deconvolution of human and mouse reads. Used when `--pdx` is run. If `null`, Xengsort Index is run using ref_fa and host_fa.
--xengsort_idx_name | 'hg38_GRCm39-NOD_ShiLtJ' | Xengsort index name associated with files located in `xengsort_idx_path` or name given to outputs produced by Xengsort Index
There are two additional parameters that are human specific. They are:
Parameter| Default| Description
4 changes: 3 additions & 1 deletion bin/log/pta.nf
@@ -114,7 +114,9 @@ ______________________________________________________
--quality_phred ${params.quality_phred}
--unqualified_perc ${params.unqualified_perc}
--detect_adapter_for_pe ${params.detect_adapter_for_pe}
--xenome_prefix ${params.xenome_prefix}
--xengsort_host_fasta ${params.xengsort_host_fasta}
--xengsort_idx_path ${params.xengsort_idx_path}
--xengsort_idx_name ${params.xengsort_idx_name}
--ref_fa ${params.ref_fa}
--ref_fa_indices ${params.ref_fa_indices}
--ref_fa_dict ${params.ref_fa_dict}
10 changes: 9 additions & 1 deletion bin/log/rnaseq.nf
@@ -57,6 +57,7 @@ ______________________________________________________
--strandedness ${params.strandedness}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
Human specific files:
--rsem_ref_prefix_human ${params.rsem_ref_prefix_human}
@@ -109,13 +110,16 @@ ______________________________________________________
--seed_length ${params.seed_length}
--pdx ${params.pdx}
--xenome_prefix ${params.xenome_prefix}
--xengsort_host_fasta ${params.xengsort_host_fasta}
--xengsort_idx_path ${params.xengsort_idx_path}
--xengsort_idx_name ${params.xengsort_idx_name}
--strandedness_ref ${params.strandedness_ref}
--strandedness_gtf ${params.strandedness_gtf}
--strandedness ${params.strandedness}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
Human specific files:
--rsem_ref_prefix_human ${params.rsem_ref_prefix_human}
@@ -174,6 +178,7 @@ ______________________________________________________
--rsem_ref_prefix ${params.rsem_ref_prefix}
--rsem_ref_files ${params.rsem_ref_files}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
--picard_dict ${params.picard_dict}
--ref_flat ${params.ref_flat}
--ribo_intervals ${params.ribo_intervals}
@@ -218,6 +223,7 @@ ______________________________________________________
--rsem_ref_prefix ${params.rsem_ref_prefix}
--rsem_ref_files ${params.rsem_ref_files}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
--rsem_star_prefix ${params.rsem_star_prefix}
--picard_dict ${params.picard_dict}
--ref_flat ${params.ref_flat}
@@ -263,6 +269,7 @@ ______________________________________________________
--rsem_ref_prefix ${params.rsem_ref_prefix}
--rsem_ref_files ${params.rsem_ref_files}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
--picard_dict ${params.picard_dict}
Project Directory: ${projectDir}
@@ -305,6 +312,7 @@ ______________________________________________________
--rsem_ref_prefix ${params.rsem_ref_prefix}
--rsem_ref_files ${params.rsem_ref_files}
--rsem_aligner ${params.rsem_aligner}
--merge_rna_counts ${params.merge_rna_counts}
--rsem_star_prefix ${params.rsem_star_prefix}
--picard_dict ${params.picard_dict}
17 changes: 9 additions & 8 deletions bin/shared/read_group_from_fastq.py
@@ -32,7 +32,7 @@


def parse_args():
parser = argparse.ArgumentParser(version='V2.0')
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--picard', action='store_true',
help="Use Picard format for read group line")
parser.add_argument('-t', '--tumor', action='store_true',
@@ -64,11 +64,14 @@ def parse_args():
def multi_open(name):
if name.endswith('.gz'):
f = gzip.open(name)
line = f.readline().decode()
elif name.endswith('.bz2'):
f = bz2.BZ2File(name)
line = f.readline().decode()
else:
f = open(name)
return f
line = f.readline()
return line


def make_fake(args):
@@ -153,11 +156,9 @@ def main():
# - the ID in the first four fields.
# Note: the leading '@' needs to be stripped.
try:
inf = multi_open(args.fastq[0])
line = inf.readline()
except IOError, e:
print >> sys.stderr, "Couldn't read the file: {0}\n {1}". \
format(fn, e.message)
line = multi_open(args.fastq[0])
except IOError as e:
sys.stderr.write("Couldn't read the file: {0}\n {1}".format(args.fastq[0], e))
make_fake(args)
# Does not return

@@ -200,7 +201,7 @@ def output(id, ges_id, cust_id, bar_code, args):
line = '@RG\\tID:{0}{1}\\tLB:{0}{2}\\tSM:{3}\\tPL:ILLUMINA'.\
format(args.sample_type, id, ges_id, cust_id)
# This needs to be a single line file; no terminating \n
print >> of, line,
print(line, file=of, end='')
if of != sys.stdout:
of.close()

2 changes: 1 addition & 1 deletion config/rnaseq.config
@@ -19,7 +19,7 @@ params {
concat_lanes = false
download_data = false
csv_input = null

merge_rna_counts = false

pdx = false // if PDX, gen_org == human and xengsort is run to remove mouse reads from the sample(s).

8 changes: 6 additions & 2 deletions modules/bowtie/bowtie.nf
@@ -19,7 +19,7 @@ process BOWTIE {

script:
"""
zcat ${fq_read} | bowtie -p ${task.cpus} -q -a --best --strata --sam -v 3 -x ${params.bowtie_index} - 2> ${sampleID}.bowtie_${paired_read_num}.log | samtools view -bS - > ${sampleID}_mapped_${paired_read_num}.bam
set -o pipefail
zcat ${fq_read} \\
| bowtie -p ${task.cpus} -q -a --best --strata --sam -v 3 -x ${params.bowtie_index} - 2> ${sampleID}.bowtie_${paired_read_num}.log \\
| samtools view -bS - > ${sampleID}_mapped_${paired_read_num}.bam
"""
// NOTE: This is hard coded to .gz input files.

@@ -53,4 +57,4 @@ Report alignments with at most <int> mismatches. -e and -l options are ignored a
-m <int>
Suppress all alignments for a particular read or pair if more than <int> reportable alignments exist for it. Reportable alignments are those that would be reported given the -n, -v, -l, -e, -k, -a, --best, and --strata options. Default: no limit. Bowtie is designed to be very fast for small -m but bowtie can become significantly slower for larger values of -m. If you would like to use Bowtie for larger values of -k, consider building an index with a denser suffix-array sample, i.e. specify a smaller -o/--offrate when invoking bowtie-build for the relevant index (see the Performance tuning section for details).
*/
*/
2 changes: 1 addition & 1 deletion modules/fastp/fastp.nf
@@ -9,7 +9,7 @@ process FASTP {

container 'quay.io/biocontainers/fastp:0.23.2--h5f740d0_3'

publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'stats'}", pattern: "${sampleID}_fastp_report.*", mode:'copy'
publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'stats'}", pattern: "${sampleID}_fastp*.*", mode:'copy'

input:
tuple val(sampleID), path(fq_reads)
2 changes: 1 addition & 1 deletion modules/nygenome/lancet.nf
@@ -3,7 +3,7 @@ process LANCET {

cpus = 4
memory = 30.GB
time = '10:00:00'
time = '18:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/jaxcompsci/lancet:v1.1.0'
8 changes: 5 additions & 3 deletions modules/picard/picard_markduplicates.nf
@@ -2,7 +2,7 @@ process PICARD_MARKDUPLICATES {
tag "$sampleID"

cpus 1
memory { bam.size() < 60.GB ? 20.GB : 40.GB }
memory 130.GB
time { bam.size() < 60.GB ? '18:00:00' : '48:00:00' }
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

@@ -29,17 +29,19 @@ process PICARD_MARKDUPLICATES {
tuple val(sampleID), file("*.txt"), emit: dedup_metrics

script:
String my_mem = (task.memory-1.GB).toString()
String my_mem = (task.memory-10.GB).toString()
my_mem = my_mem[0..-4]

"""
picard -Xmx${my_mem}G MarkDuplicates \
picard -Xmx${my_mem}G -Xms${my_mem}G MarkDuplicates \
I=${bam[0]} \
O=${sampleID}_dedup.bam \
M=${sampleID}_dup_metrics.txt \
REMOVE_DUPLICATES=false \
CREATE_INDEX=true \
TMP_DIR=${workDir}/temp \
SORTING_COLLECTION_SIZE_RATIO=0.15 \
MAX_RECORDS_IN_RAM=300000 \
VALIDATION_STRINGENCY=SILENT
"""
}
2 changes: 1 addition & 1 deletion modules/picard/picard_reordersam.nf
@@ -2,7 +2,7 @@ process PICARD_REORDERSAM {
tag "$sampleID"

cpus 1
memory 50.GB
memory 70.GB
time '06:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

2 changes: 1 addition & 1 deletion modules/picard/picard_sortsam.nf
@@ -2,7 +2,7 @@ process PICARD_SORTSAM {
tag "$sampleID"

cpus 1
memory { sam.size() < 60.GB ? 10.GB : 30.GB }
memory { sam.size() < 60.GB ? 30.GB : 60.GB }
time { sam.size() < 60.GB ? '10:00:00' : '24:00:00' }
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

53 changes: 53 additions & 0 deletions modules/utility_modules/merge_rsem_counts.nf
@@ -0,0 +1,53 @@
// adapted from https://github.com/nf-core/rnaseq/blob/3.14.0/modules/local/rsem_merge_counts/main.nf
process MERGE_RSEM_COUNTS {

tag "Merge RSEM counts"

cpus 1
memory 24.GB
time 2.h
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container "quay.io/jaxcompsci/py3_perl_pylibs:v2"

publishDir "${params.pubdir}", pattern: "*rsem.merged.*.tsv", mode:'copy'

input:
path("genes/*")
path("isoforms/*")
val(prefix)

output:
path "*rsem.merged.gene_counts.tsv", emit: gene_counts
path "*rsem.merged.gene_tpm.tsv", emit: gene_tpm
path "*rsem.merged.transcript_counts.tsv", emit: transcript_counts
path "*rsem.merged.transcript_tpm.tsv", emit: transcript_tpm

script:
"""
mkdir -p tmp/genes
cut -f 1,2 `ls ./genes/* | head -n 1` > gene_ids.txt
for fileid in `ls ./genes/*`; do
samplename=`basename \$fileid | sed s/\\.genes.results\$//g`
echo \$samplename > tmp/genes/\${samplename}.counts.txt
cut -f 5 \${fileid} | tail -n+2 >> tmp/genes/\${samplename}.counts.txt
echo \$samplename > tmp/genes/\${samplename}.tpm.txt
cut -f 6 \${fileid} | tail -n+2 >> tmp/genes/\${samplename}.tpm.txt
done
mkdir -p tmp/isoforms
cut -f 1,2 `ls ./isoforms/* | head -n 1` > transcript_ids.txt
for fileid in `ls ./isoforms/*`; do
samplename=`basename \$fileid | sed s/\\.isoforms.results\$//g`
echo \$samplename > tmp/isoforms/\${samplename}.counts.txt
cut -f 5 \${fileid} | tail -n+2 >> tmp/isoforms/\${samplename}.counts.txt
echo \$samplename > tmp/isoforms/\${samplename}.tpm.txt
cut -f 6 \${fileid} | tail -n+2 >> tmp/isoforms/\${samplename}.tpm.txt
done
paste gene_ids.txt tmp/genes/*.counts.txt > ${prefix}.rsem.merged.gene_counts.tsv
paste gene_ids.txt tmp/genes/*.tpm.txt > ${prefix}.rsem.merged.gene_tpm.tsv
paste transcript_ids.txt tmp/isoforms/*.counts.txt > ${prefix}.rsem.merged.transcript_counts.tsv
paste transcript_ids.txt tmp/isoforms/*.tpm.txt > ${prefix}.rsem.merged.transcript_tpm.tsv
"""
}
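
For reference, the merged tables carry the two ID columns from the first RSEM results file (`cut -f 1,2`) followed by one column per sample; expected counts come from column 5 and TPM from column 6 of each per-sample file. An illustrative peek at a merged gene counts table (sample names and values are made up):

```bash
head -n 3 example.rsem.merged.gene_counts.tsv
# gene_id             transcript_id(s)      sampleA   sampleB
# ENSMUSG00000000001  ENSMUST00000000001    1523.00   1498.00
# ENSMUSG00000000003  ENSMUST00000000003       0.00      2.00
```
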
2 changes: 1 addition & 1 deletion modules/utility_modules/read_groups.nf
@@ -6,7 +6,7 @@ process READ_GROUPS {
time '01:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18'
container 'quay.io/jaxcompsci/py3_bgzip:3.10.12'

publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.workflow == 'rnaseq' || params.keep_intermediate
