From 7eca4a5e484ec0a7741e5695f4858798ae28c540 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 12:08:16 -0400
Subject: [PATCH 1/6] fix: set singularity cachedir inside workdir

temporary fix #33

later we will provide a more robust solution that copies previously downloaded SIFs
---
 aspen | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/aspen b/aspen
index 2c53359..62c010c 100755
--- a/aspen
+++ b/aspen
@@ -765,11 +765,11 @@ function main(){
 
   # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh"
 
-  if [[ ! -z "$SING_CACHE_DIR" ]];then
-    EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
-  else
-    EXPORT_SING_CACHE_DIR_CMD=""
+  if [[ -z "$SING_CACHE_DIR" ]]; then
+    echo "singularity cache dir (--singcache) is not set, using ${WORKDIR}/.singularity"
+    SING_CACHE_DIR="${WORKDIR}/.singularity"
   fi
+  EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
 
   case $RUNMODE in
     init)     init      && exit 0;;

From 54e2de6eb51c3ca08b3327f3f0be4b49b26f1ce9 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 12:14:30 -0400
Subject: [PATCH 2/6] feat: use /data/USER for sing cache dir if available

---
 aspen | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/aspen b/aspen
index 62c010c..f2a908d 100755
--- a/aspen
+++ b/aspen
@@ -766,8 +766,14 @@ function main(){
   # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh"
 
   if [[ -z "$SING_CACHE_DIR" ]]; then
-    echo "singularity cache dir (--singcache) is not set, using ${WORKDIR}/.singularity"
-    SING_CACHE_DIR="${WORKDIR}/.singularity"
+    echo "singularity cache dir (--singcache) is not set"
+    if [[ -d "/data/$USER" ]]; then
+      SING_CACHE_DIR="/data/$USER/.singularity"
+    else
+      SING_CACHE_DIR="${WORKDIR}/.singularity"
+    fi
+    echo "\tusing ${SING_CACHE_DIR}"
+    mkdir -p $SING_CACHE_DIR
   fi
   EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
 

From e2a2e36ad1b84adbff1aedce32b2d64d76dfb997 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 12:18:50 -0400
Subject: [PATCH 3/6] docs: describe --singcache behavior

---
 CHANGELOG.md |  2 ++
 README.md    | 99 +++++++++++++++++++++++++++-------------------------
 2 files changed, 53 insertions(+), 48 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e21d1e4..7b91777 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## ASPEN development version
 
+- Default the singularity cache dir to `/data/$USER/.singularity` (or `${WORKDIR}/.singularity` if `/data/$USER` does not exist) when `--singcache` is not provided. (#37, @kelly-sovacool)
+
 ## ASPEN 1.0.1
 
 - differential ATAC updated
diff --git a/README.md b/README.md
index 84a00d6..63c76cd 100644
--- a/README.md
+++ b/README.md
@@ -2,41 +2,41 @@
 
 ### Table of Contents
 
--   [ASPEN - **A**tac **S**eq **P**ip**E**li**N**e](#aspen)
-    -   [1. Outline](#1-outline)
-    -   [2. Runtime details](#2-runtime-details)
-        -   [2.1 Load Module On Biowulf](#21-load-module-on-biowulf)
-        -   [2.2 Create Sample Manifest](#22-create-sample-manifest)
-        -   [2.3 Run Pipeline](#23-run-pipeline)
-    -   [3. Genomes](#3-genomes)
-    -   [4. Disclaimer](#4-disclaimer)
-    -   [5. Help](#5-help)
+- [ASPEN - **A**tac **S**eq **P**ip**E**li**N**e](#aspen)
+  - [1. Outline](#1-outline)
+  - [2. Runtime details](#2-runtime-details)
+    - [2.1 Load Module On Biowulf](#21-load-module-on-biowulf)
+    - [2.2 Create Sample Manifest](#22-create-sample-manifest)
+    - [2.3 Run Pipeline](#23-run-pipeline)
+  - [3. Genomes](#3-genomes)
+  - [4. Disclaimer](#4-disclaimer)
+  - [5. Help](#5-help)
 
 ### 1. Outline
 
 ASPEN or **A**tac **S**eq **P**ip**E**li**N**e is CCBR's pipeline to calls peaks for ATAC-Seq datasets. It currently accepts paired-end Illumina data and calls peak using [MACS2](https://doi.org/10.1186/gb-2008-9-9-r137) and [Genrich](https://github.com/jsh58/Genrich) peak callers. Below is a brief outline of the steps performed by the pipeline:
 
--   Trim PE reads with [CutAdapt](https://doi.org/10.14806/ej.17.1.200)
--   Remove reads aligning to known [blacklisted regions](https://doi.org/10.1038/s41598-019-45839-z), if provided
--   Align reads provided genome using [bowtie2](https://doi.org/10.1038%2Fnmeth.1923). This step generates multiple output files:
-    -   `tagAlign.gz`, which is a BED6 format file mainly required for MACS2 peak calling
-    -   `dedup.bam`, deduplicated BAM format file which may be required for downstream processing (eg. [TOBIAS](https://github.com/CCBR/CCBR_tobias))
-    -   `qsorted.bam`, query sorted BAM file for Genrich peak calling
--   Pre-peakcalling QC metrics:
-    -   [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is run pre- and post-trimming
-    -   Fragment length distribution is calculated using custom scripts
-    -   [Preseq](https://smithlabresearch.org/software/preseq/) is run to estimate library complexity
--   Post-peakcalling QC metrics:
-    -   TSS distributions are calculated for each replicate
-    -   FRiP (Fraction of Reads in Peaks) is calculated for each replicate
-    -   FRiPextra calculations are performed if _fripextra_ config files are supplied
-        -   Fraction of reads in DHS regions
-        -   Fraction of reads in promoter regions
-        -   Fraction of reads in enhancer regions
--   Peak calling: Peaks (NarrowPeak format) are called using MACS2 and Genrich. If multiple replicates exist per sample, consensus peaks are called (BED format).
--   Peak annotation: [ChIPseeker](https://doi.org/10.1093/bioinformatics/btv145) is used to annotate peaks if the genome is hg38/hg19/mm10
--   Motif enrichment: Motif Enrichment is calculated using [HOMER](http://homer.ucsd.edu/homer/) and [AME (MEME suite)](https://meme-suite.org/meme/doc/ame.html)
--   Report: [MultiQC](10.1093/bioinformatics/btw354) is used to generate a customized final HTML report
+- Trim PE reads with [CutAdapt](https://doi.org/10.14806/ej.17.1.200)
+- Remove reads aligning to known [blacklisted regions](https://doi.org/10.1038/s41598-019-45839-z), if provided
+- Align reads provided genome using [bowtie2](https://doi.org/10.1038%2Fnmeth.1923). This step generates multiple output files:
+  - `tagAlign.gz`, which is a BED6 format file mainly required for MACS2 peak calling
+  - `dedup.bam`, deduplicated BAM format file which may be required for downstream processing (eg. [TOBIAS](https://github.com/CCBR/CCBR_tobias))
+  - `qsorted.bam`, query sorted BAM file for Genrich peak calling
+- Pre-peakcalling QC metrics:
+  - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is run pre- and post-trimming
+  - Fragment length distribution is calculated using custom scripts
+  - [Preseq](https://smithlabresearch.org/software/preseq/) is run to estimate library complexity
+- Post-peakcalling QC metrics:
+  - TSS distributions are calculated for each replicate
+  - FRiP (Fraction of Reads in Peaks) is calculated for each replicate
+  - FRiPextra calculations are performed if _fripextra_ config files are supplied
+    - Fraction of reads in DHS regions
+    - Fraction of reads in promoter regions
+    - Fraction of reads in enhancer regions
+- Peak calling: Peaks (NarrowPeak format) are called using MACS2 and Genrich. If multiple replicates exist per sample, consensus peaks are called (BED format).
+- Peak annotation: [ChIPseeker](https://doi.org/10.1093/bioinformatics/btv145) is used to annotate peaks if the genome is hg38/hg19/mm10
+- Motif enrichment: Motif Enrichment is calculated using [HOMER](http://homer.ucsd.edu/homer/) and [AME (MEME suite)](https://meme-suite.org/meme/doc/ame.html)
+- Report: [MultiQC](10.1093/bioinformatics/btw354) is used to generate a customized final HTML report
 
 ### 2. Runtime details
 
@@ -67,9 +67,9 @@ Once the data is stored on biowulf, sample manifest TSV (`samples.tsv`) can be c
 
 Note that:
 
--   symlinks are created for R1 and R2 files from the sample manifest in the results folder. These symlinks have the filenames \<replicateName\>.R1.fastq.gz and \<replicateName\>.R2.fastq.gz, respectively. Thus, original filenames do not matter and original files do not need to be renamed.
--   **replicateName** is used as prefix for individual peak calls
--   **sampleName** is used as prefix for consensus peak calls
+- symlinks are created for R1 and R2 files from the sample manifest in the results folder. These symlinks have the filenames \<replicateName\>.R1.fastq.gz and \<replicateName\>.R2.fastq.gz, respectively. Thus, original filenames do not matter and original files do not need to be renamed.
+- **replicateName** is used as prefix for individual peak calls
+- **sampleName** is used as prefix for consensus peak calls
 
 > NOTE:
 > Optionally, if running differential ATAC please also provide `contrasts.tsv` in the output folder after running `init`. This is a simple tab-delimited text file with 2 columns (_Group1_ and _Group2_) without any headers.
@@ -129,15 +129,18 @@ Required Arguments:
 
 Optional Arguments:
 
+--help|-h       : print this help
 --genome|-g     : genome eg. hg38
 --manifest|-s   : absolute path to samples.tsv. This will be copied to output folder                    (--runmode=init only)
 --useenvmod|-e  : use "--use-enmodules" option while running Snakemake. This is for using modules on HPC instead of containers(default).
---help|-h       : print this help
+--singcache|-c  : singularity cache directory. Defaults to `/data/${USER}/.singularity` if `/data/${USER}` exists, otherwise `${WORKDIR}/.singularity`.
+
 
 Example commands:
   bash ./aspen -w=/my/output/folder -m=init
   bash ./aspen -w=/my/output/folder -m=dryrun
   bash ./aspen -w=/my/output/folder -m=run
+  bash ./aspen -w=/my/output/folder -m=run -c /data/${USER}/.singularity
 
 ##########################################################################################
 
@@ -155,31 +158,31 @@ VersionInfo:
 
 1. **Initialize the output folder**:
 
-    This can be done using the following command:
+   This can be done using the following command:
 
-    ```bash
-    % aspen -m=init -w=<path_to_output_folder>
-    ```
+   ```bash
+   % aspen -m=init -w=<path_to_output_folder>
+   ```
 
-    The above command will create `config.yaml` and `samples.tsv` in the output folder. Please edit these as per your requirements. You can replace the `samples.tsv` file in the output folder with the sample manifest created in the previous step outlined above. `contrasts.tsv` should also be included if running differential ATAC.
+   The above command will create `config.yaml` and `samples.tsv` in the output folder. Please edit these as per your requirements. You can replace the `samples.tsv` file in the output folder with the sample manifest created in the previous step outlined above. `contrasts.tsv` should also be included if running differential ATAC.
 
 2. **Dryrun**:
 
-    To dry-run the pipeline, you can run the following command after initializing the output folder:
+   To dry-run the pipeline, you can run the following command after initializing the output folder:
 
-    ```bash
-    % aspen -m=dryrun -w=<path_to_output_folder>
-    ```
+   ```bash
+   % aspen -m=dryrun -w=<path_to_output_folder>
+   ```
 
-    This should list out the chain of jobs (DAG) that will be submitted to the job scheduler.
+   This should list out the chain of jobs (DAG) that will be submitted to the job scheduler.
 
 3. **RUN!!**:
 
-    If the dry-run looks as expected, then you can submit the job using:
+   If the dry-run looks as expected, then you can submit the job using:
 
-    ```bash
-    % aspen -m=run -w=<path_to_output_folder>
-    ```
+   ```bash
+   % aspen -m=run -w=<path_to_output_folder>
+   ```
 
 This will submit one _master_ job to slurm, which will in turn keep managing the entire pipeline and submit/monitor jobs to the job scheduler as and when required.
 

From 479febbf713cd3e7d4ba9729b0e0c3a9eb9787ad Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 12:39:14 -0400
Subject: [PATCH 4/6] fix: export sing cache dir in sbatch script & for
 runlocal

---
 aspen | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/aspen b/aspen
index f2a908d..cc1d574 100755
--- a/aspen
+++ b/aspen
@@ -530,6 +530,8 @@ set -exo pipefail
 
 $MODULE_STR
 
+$EXPORT_SING_CACHE_DIR_CMD
+
 snakemake -s $SNAKEFILE \
 --directory $WORKDIR \
 --printshellcmds \
@@ -594,6 +596,8 @@ $MODULE_STR
 
 cd \$SLURM_SUBMIT_DIR
 
+$EXPORT_SING_CACHE_DIR_CMD
+
 snakemake -s $SNAKEFILE \
 --directory $WORKDIR \
 $SINGULARITY_STR \
@@ -620,12 +624,8 @@ fi
 EOF
 
   cd $WORKDIR
-  if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then
-    $EXPORT_SING_CACHE_DIR_CMD && \
-    sbatch submit_script.sbatch
-  else
-    sbatch submit_script.sbatch
-  fi
+  sbatch submit_script.sbatch
+
 
 ##########################################################################################
 # unlock or dry-run

From dadc2ee2ae6473d201bdfed2179653bec3a24511 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 12:48:54 -0400
Subject: [PATCH 5/6] fix: make sure SING_CACHE_DIR exists

---
 aspen | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aspen b/aspen
index cc1d574..30801b0 100755
--- a/aspen
+++ b/aspen
@@ -773,8 +773,8 @@ function main(){
       SING_CACHE_DIR="${WORKDIR}/.singularity"
     fi
     echo "\tusing ${SING_CACHE_DIR}"
-    mkdir -p $SING_CACHE_DIR
   fi
+  mkdir -p $SING_CACHE_DIR
   EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
 
   case $RUNMODE in

From d1f4b06f2c02f9bde1252bfac8248bbbbbd4562b Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Tue, 10 Sep 2024 13:04:38 -0400
Subject: [PATCH 6/6] style: simplify singcache echo statement

---
 aspen | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/aspen b/aspen
index 30801b0..ec7b6f7 100755
--- a/aspen
+++ b/aspen
@@ -766,13 +766,12 @@ function main(){
   # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh"
 
   if [[ -z "$SING_CACHE_DIR" ]]; then
-    echo "singularity cache dir (--singcache) is not set"
     if [[ -d "/data/$USER" ]]; then
       SING_CACHE_DIR="/data/$USER/.singularity"
     else
       SING_CACHE_DIR="${WORKDIR}/.singularity"
     fi
-    echo "\tusing ${SING_CACHE_DIR}"
+    echo "singularity cache dir (--singcache) is not set, using ${SING_CACHE_DIR}"
   fi
   mkdir -p $SING_CACHE_DIR
   EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""