From 7eca4a5e484ec0a7741e5695f4858798ae28c540 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 12:08:16 -0400 Subject: [PATCH 1/6] fix: set singularity cachedir inside workdir temporary fix #33 later we will provide a more robust solution that copies previously downloaded SIFs --- aspen | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aspen b/aspen index 2c53359..62c010c 100755 --- a/aspen +++ b/aspen @@ -765,11 +765,11 @@ function main(){ # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh" - if [[ ! -z "$SING_CACHE_DIR" ]];then - EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\"" - else - EXPORT_SING_CACHE_DIR_CMD="" + if [[ -z "$SING_CACHE_DIR" ]]; then + echo "singularity cache dir (--singcache) is not set, using ${WORKDIR}/.singularity" + SING_CACHE_DIR="${WORKDIR}/.singularity" fi + EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\"" case $RUNMODE in init) init && exit 0;; From 54e2de6eb51c3ca08b3327f3f0be4b49b26f1ce9 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 12:14:30 -0400 Subject: [PATCH 2/6] feat: use /data/USER for sing cache dir if available --- aspen | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/aspen b/aspen index 62c010c..f2a908d 100755 --- a/aspen +++ b/aspen @@ -766,8 +766,14 @@ function main(){ # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh" if [[ -z "$SING_CACHE_DIR" ]]; then - echo "singularity cache dir (--singcache) is not set, using ${WORKDIR}/.singularity" - SING_CACHE_DIR="${WORKDIR}/.singularity" + echo "singularity cache dir (--singcache) is not set" + if [[ -d "/data/$USER" ]]; then + SING_CACHE_DIR="/data/$USER/.singularity" + else + SING_CACHE_DIR="${WORKDIR}/.singularity" + fi + echo "\tusing ${SING_CACHE_DIR}" + mkdir -p $SING_CACHE_DIR fi EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\"" From e2a2e36ad1b84adbff1aedce32b2d64d76dfb997 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 12:18:50 -0400 Subject: [PATCH 3/6] docs: describe --singcache behavior --- CHANGELOG.md | 2 ++ README.md | 99 +++++++++++++++++++++++++++------------------------- 2 files changed, 53 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e21d1e4..7b91777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## ASPEN development version +- Set the singularity cache dir if `--singcache` is not provided. (#37, @kelly-sovacool) + ## ASPEN 1.0.1 - differential ATAC updated diff --git a/README.md b/README.md index 84a00d6..63c76cd 100644 --- a/README.md +++ b/README.md @@ -2,41 +2,41 @@ ### Table of Contents -- [ASPEN - **A**tac **S**eq **P**ip**E**li**N**e](#aspen) - - [1. Outline](#1-outline) - - [2. Runtime details](#2-runtime-details) - - [2.1 Load Module On Biowulf](#21-load-module-on-biowulf) - - [2.2 Create Sample Manifest](#22-create-sample-manifest) - - [2.3 Run Pipeline](#23-run-pipeline) - - [3. Genomes](#3-genomes) - - [4. Disclaimer](#4-disclaimer) - - [5. Help](#5-help) +- [ASPEN - **A**tac **S**eq **P**ip**E**li**N**e](#aspen) + - [1. Outline](#1-outline) + - [2. Runtime details](#2-runtime-details) + - [2.1 Load Module On Biowulf](#21-load-module-on-biowulf) + - [2.2 Create Sample Manifest](#22-create-sample-manifest) + - [2.3 Run Pipeline](#23-run-pipeline) + - [3. Genomes](#3-genomes) + - [4. Disclaimer](#4-disclaimer) + - [5. Help](#5-help) ### 1. Outline ASPEN or **A**tac **S**eq **P**ip**E**li**N**e is CCBR's pipeline to calls peaks for ATAC-Seq datasets. It currently accepts paired-end Illumina data and calls peak using [MACS2](https://doi.org/10.1186/gb-2008-9-9-r137) and [Genrich](https://github.com/jsh58/Genrich) peak callers. Below is a brief outline of the steps performed by the pipeline: -- Trim PE reads with [CutAdapt](https://doi.org/10.14806/ej.17.1.200) -- Remove reads aligning to known [blacklisted regions](https://doi.org/10.1038/s41598-019-45839-z), if provided -- Align reads provided genome using [bowtie2](https://doi.org/10.1038%2Fnmeth.1923). This step generates multiple output files: - - `tagAlign.gz`, which is a BED6 format file mainly required for MACS2 peak calling - - `dedup.bam`, deduplicated BAM format file which may be required for downstream processing (eg. [TOBIAS](https://github.com/CCBR/CCBR_tobias)) - - `qsorted.bam`, query sorted BAM file for Genrich peak calling -- Pre-peakcalling QC metrics: - - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is run pre- and post-trimming - - Fragment length distribution is calculated using custom scripts - - [Preseq](https://smithlabresearch.org/software/preseq/) is run to estimate library complexity -- Post-peakcalling QC metrics: - - TSS distributions are calculated for each replicate - - FRiP (Fraction of Reads in Peaks) is calculated for each replicate - - FRiPextra calculations are performed if _fripextra_ config files are supplied - - Fraction of reads in DHS regions - - Fraction of reads in promoter regions - - Fraction of reads in enhancer regions -- Peak calling: Peaks (NarrowPeak format) are called using MACS2 and Genrich. If multiple replicates exist per sample, consensus peaks are called (BED format). -- Peak annotation: [ChIPseeker](https://doi.org/10.1093/bioinformatics/btv145) is used to annotate peaks if the genome is hg38/hg19/mm10 -- Motif enrichment: Motif Enrichment is calculated using [HOMER](http://homer.ucsd.edu/homer/) and [AME (MEME suite)](https://meme-suite.org/meme/doc/ame.html) -- Report: [MultiQC](10.1093/bioinformatics/btw354) is used to generate a customized final HTML report +- Trim PE reads with [CutAdapt](https://doi.org/10.14806/ej.17.1.200) +- Remove reads aligning to known [blacklisted regions](https://doi.org/10.1038/s41598-019-45839-z), if provided +- Align reads provided genome using [bowtie2](https://doi.org/10.1038%2Fnmeth.1923). This step generates multiple output files: + - `tagAlign.gz`, which is a BED6 format file mainly required for MACS2 peak calling + - `dedup.bam`, deduplicated BAM format file which may be required for downstream processing (eg. [TOBIAS](https://github.com/CCBR/CCBR_tobias)) + - `qsorted.bam`, query sorted BAM file for Genrich peak calling +- Pre-peakcalling QC metrics: + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is run pre- and post-trimming + - Fragment length distribution is calculated using custom scripts + - [Preseq](https://smithlabresearch.org/software/preseq/) is run to estimate library complexity +- Post-peakcalling QC metrics: + - TSS distributions are calculated for each replicate + - FRiP (Fraction of Reads in Peaks) is calculated for each replicate + - FRiPextra calculations are performed if _fripextra_ config files are supplied + - Fraction of reads in DHS regions + - Fraction of reads in promoter regions + - Fraction of reads in enhancer regions +- Peak calling: Peaks (NarrowPeak format) are called using MACS2 and Genrich. If multiple replicates exist per sample, consensus peaks are called (BED format). +- Peak annotation: [ChIPseeker](https://doi.org/10.1093/bioinformatics/btv145) is used to annotate peaks if the genome is hg38/hg19/mm10 +- Motif enrichment: Motif Enrichment is calculated using [HOMER](http://homer.ucsd.edu/homer/) and [AME (MEME suite)](https://meme-suite.org/meme/doc/ame.html) +- Report: [MultiQC](10.1093/bioinformatics/btw354) is used to generate a customized final HTML report ### 2. Runtime details @@ -67,9 +67,9 @@ Once the data is stored on biowulf, sample manifest TSV (`samples.tsv`) can be c Note that: -- symlinks are created for R1 and R2 files from the sample manifest in the results folder. These symlinks have the filenames \.R1.fastq.gz and \.R2.fastq.gz, respectively. Thus, original filenames do not matter and original files do not need to be renamed. -- **replicateName** is used as prefix for individual peak calls -- **sampleName** is used as prefix for consensus peak calls +- symlinks are created for R1 and R2 files from the sample manifest in the results folder. These symlinks have the filenames \.R1.fastq.gz and \.R2.fastq.gz, respectively. Thus, original filenames do not matter and original files do not need to be renamed. +- **replicateName** is used as prefix for individual peak calls +- **sampleName** is used as prefix for consensus peak calls > NOTE: > Optionally, if running differential ATAC please also provide `contrasts.tsv` in the output folder after running `init`. This is a simple tab-delimited text file with 2 columns (_Group1_ and _Group2_) without any headers. @@ -129,15 +129,18 @@ Required Arguments: Optional Arguments: +--help|-h : print this help --genome|-g : genome eg. hg38 --manifest|-s : absolute path to samples.tsv. This will be copied to output folder (--runmode=init only) --useenvmod|-e : use "--use-enmodules" option while running Snakemake. This is for using modules on HPC instead of containers(default). ---help|-h : print this help +--singcache|-c : singularity cache directory. Default is `/data/${USER}/.singularity` if available, or falls back to `${WORKDIR}/.singularity`. + Example commands: bash ./aspen -w=/my/output/folder -m=init bash ./aspen -w=/my/output/folder -m=dryrun bash ./aspen -w=/my/output/folder -m=run + bash ./aspen -w=/my/output/folder -m=run -c /data/${USER}/.singularity ########################################################################################## @@ -155,31 +158,31 @@ VersionInfo: 1. **Initialize the output folder**: - This can be done using the following command: + This can be done using the following command: - ```bash - % aspen -m=init -w= - ``` + ```bash + % aspen -m=init -w= + ``` - The above command will create `config.yaml` and `samples.tsv` in the output folder. Please edit these as per your requirements. You can replace the `samples.tsv` file in the output folder with the sample manifest created in the previous step outlined above. `contrasts.tsv` should also be included if running differential ATAC. + The above command will create `config.yaml` and `samples.tsv` in the output folder. Please edit these as per your requirements. You can replace the `samples.tsv` file in the output folder with the sample manifest created in the previous step outlined above. `contrasts.tsv` should also be included if running differential ATAC. 2. **Dryrun**: - To dry-run the pipeline, you can run the following command after initializing the output folder: + To dry-run the pipeline, you can run the following command after initializing the output folder: - ```bash - % aspen -m=dryrun -w= - ``` + ```bash + % aspen -m=dryrun -w= + ``` - This should list out the chain of jobs (DAG) that will be submitted to the job scheduler. + This should list out the chain of jobs (DAG) that will be submitted to the job scheduler. 3. **RUN!!**: - If the dry-run looks as expected, then you can submit the job using: + If the dry-run looks as expected, then you can submit the job using: - ```bash - % aspen -m=run -w= - ``` + ```bash + % aspen -m=run -w= + ``` This will submit one _master_ job to slurm, which will in turn keep managing the entire pipeline and submit/monitor jobs to the job scheduler as and when required. From 479febbf713cd3e7d4ba9729b0e0c3a9eb9787ad Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 12:39:14 -0400 Subject: [PATCH 4/6] fix: export sing cache dir in sbatch script & for runlocal --- aspen | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aspen b/aspen index f2a908d..cc1d574 100755 --- a/aspen +++ b/aspen @@ -530,6 +530,8 @@ set -exo pipefail $MODULE_STR +$EXPORT_SING_CACHE_DIR_CMD + snakemake -s $SNAKEFILE \ --directory $WORKDIR \ --printshellcmds \ @@ -594,6 +596,8 @@ $MODULE_STR cd \$SLURM_SUBMIT_DIR +$EXPORT_SING_CACHE_DIR_CMD + snakemake -s $SNAKEFILE \ --directory $WORKDIR \ $SINGULARITY_STR \ @@ -620,12 +624,8 @@ fi EOF cd $WORKDIR - if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then - $EXPORT_SING_CACHE_DIR_CMD && \ - sbatch submit_script.sbatch - else - sbatch submit_script.sbatch - fi + sbatch submit_script.sbatch + ########################################################################################## # unlock or dry-run From dadc2ee2ae6473d201bdfed2179653bec3a24511 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 12:48:54 -0400 Subject: [PATCH 5/6] fix: make sure SING_CACHE_DIR exists --- aspen | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aspen b/aspen index cc1d574..30801b0 100755 --- a/aspen +++ b/aspen @@ -773,8 +773,8 @@ function main(){ SING_CACHE_DIR="${WORKDIR}/.singularity" fi echo "\tusing ${SING_CACHE_DIR}" - mkdir -p $SING_CACHE_DIR fi + mkdir -p $SING_CACHE_DIR EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\"" case $RUNMODE in From d1f4b06f2c02f9bde1252bfac8248bbbbbd4562b Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 10 Sep 2024 13:04:38 -0400 Subject: [PATCH 6/6] style: simplify singcache echo statement --- aspen | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aspen b/aspen index 30801b0..ec7b6f7 100755 --- a/aspen +++ b/aspen @@ -766,13 +766,12 @@ function main(){ # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh" if [[ -z "$SING_CACHE_DIR" ]]; then - echo "singularity cache dir (--singcache) is not set" if [[ -d "/data/$USER" ]]; then SING_CACHE_DIR="/data/$USER/.singularity" else SING_CACHE_DIR="${WORKDIR}/.singularity" fi - echo "\tusing ${SING_CACHE_DIR}" + echo "singularity cache dir (--singcache) is not set, using ${SING_CACHE_DIR}" fi mkdir -p $SING_CACHE_DIR EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""