diff --git a/config/config.yaml b/config/config.yaml index 4902842..f8f534f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -74,6 +74,9 @@ macs2_control: "N" ### default GOPEAKS pvalue is 0.05 https://github.com/maxsonBraunLab/gopeaks/blob/main/README.md ### default SEACR FDR threshold 1 https://github.com/FredHutch/SEACR/blob/master/README.md quality_thresholds: "0.1, 0.05, 0.01" +## MACS2, broad-peaks specific, quality threshold +### if broadPeak is seleted as a 'peaktype', an additional quality threshold can be used +macs2_broad_peak_threshold: "0.01" # annotations ## rose parameters @@ -93,7 +96,7 @@ preparsedDir: "/data/CCBR_Pipeliner/db/PipeDB/homer/preparsedDir" reference: hg38: fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/hg38.fa" - gtf: "tbd" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/genes.gtf" blacklist: "PIPELINE_HOME/resources/blacklistbed/hg38.bed" regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" macs2_g: "hs" @@ -101,7 +104,7 @@ reference: rose: "WORKDIR/annotation/hg38_refseq.ucsc" hg19: fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/hg19.fa" - gtf: "tbd" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/genes.gtf" blacklist: "PIPELINE_HOME/resources/blacklistbed/hg19.bed" regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" macs2_g: "hs" @@ -109,7 +112,7 @@ reference: rose: "WORKDIR/annotation/hg19_refseq.ucsc" mm10: fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/mm10.fa" - gtf: "tbd" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/genes.gtf" blacklist: "PIPELINE_HOME/resources/blacklistbed/mm10.bed" regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY" macs2_g: "mm" @@ -132,4 +135,4 @@ spikein_reference: saccharomyces: fa: "PIPELINE_HOME/resources/spikein/S_cer_S288C_R64.fna" -adapters: "PIPELINE_HOME/resources/other/adapters.fa" \ No newline at end of file +adapters: "PIPELINE_HOME/resources/other/adapters.fa" diff --git a/workflow/Snakefile b/workflow/Snakefile index 7ca4bbb..6584aef 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -30,15 +30,21 @@ def run_seacr(wildcards): files.extend(s) s2=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.norm.stringent.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), files.extend(s2) - s3=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.stringent.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), - files.extend(s3) + if "non.stringent.bed" in PEAKTYPE: + s=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.stringent.bed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(s) + s2=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.stringent.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(s2) if "norm.relaxed.bed" in PEAKTYPE: r=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.norm.relaxed.bed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), files.extend(r) r2=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.norm.relaxed.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), files.extend(r2) - r3=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.relaxed.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), - files.extend(r3) + if "non.relaxed.bed" in PEAKTYPE: + r=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.relaxed.bed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(r) + r2=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.relaxed.bigbed"),qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(r2) return files def run_gopeaks(wildcards): @@ -92,7 +98,7 @@ def get_annotation_macs2(wildcards): def get_annotation_s_and_g(wildcards): files=[] - if ("narrowGo_peaks.bed" in PEAKTYPE) or ("broadGo_peaks.bed" in PEAKTYPE) or ("norm.stringent.bed" in PEAKTYPE) or ("norm.relaxed.bed" in PEAKTYPE): + if ("narrowGo_peaks.bed" in PEAKTYPE) or ("broadGo_peaks.bed" in PEAKTYPE) or ("norm.stringent.bed" in PEAKTYPE) or ("norm.relaxed.bed" in PEAKTYPE) or ("non.stringent.bed" in PEAKTYPE) or ("non.relaxed.bed" in PEAKTYPE): f=expand(join(RESULTSDIR,"annotation","{qthresholds}","{s_and_g_types}","{t_and_c}","{t_and_c}.{dupstatus}.annotation.txt"),qthresholds=QTRESHOLDS,s_and_g_types=S_AND_G_TYPES,t_and_c=TREATMENT_CONTROL_LIST,dupstatus=DUPSTATUS) files.extend(f) return files diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index c6a6162..36c7070 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -317,10 +317,11 @@ rule bam2bg: if [[ "{params.spikein}" == "" ]];then spikein_scale=1 elif [[ "{params.spikein}" == "LIBRARY" ]];then - library_size=`cat {params.library_file} | grep {params.replicate} | cut -f2 -d","` + library_size=`cat {params.library_file} | grep {params.replicate} | cut -f2 -d"," | head -n1` if [[ $library_size =~ "Inf" ]]; then spikein_scale=`head -n1 {params.library_file} | awk -F"," \'{{print $2}}\'` + spikein_scale=$(echo "$spikein_scale / 1000000" | bc -l) else spikein_scale=$(echo "$library_size / 1000000" | bc -l) fi diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 5ea9107..4a1067d 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -30,10 +30,17 @@ rule peakAnnotation_macs2: """ def get_bed_s_and_g(wildcards): + # SEACR OPTIONS if wildcards.s_and_g_types =="norm.stringent.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.t_and_c, wildcards.t_and_c + "." + wildcards.dupstatus + ".norm.stringent.bed") if wildcards.s_and_g_types =="norm.relaxed.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.t_and_c, wildcards.t_and_c + "." + wildcards.dupstatus + ".norm.relaxed.bed") + if wildcards.s_and_g_types =="non.stringent.bed": + bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.t_and_c, wildcards.t_and_c + "." + wildcards.dupstatus + ".non.stringent.bed") + if wildcards.s_and_g_types =="non.relaxed.bed": + bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.t_and_c, wildcards.t_and_c + "." + wildcards.dupstatus + ".non.relaxed.bed") + + #GOPEAKS OPTIONS if wildcards.s_and_g_types =="narrowGo_peaks.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks", wildcards.t_and_c + "." + wildcards.dupstatus + ".narrowGo_peaks.bed") if wildcards.s_and_g_types =="broadGo_peaks.bed": @@ -71,14 +78,23 @@ def get_cntrl_bam(wildcards): def get_bed_all(wildcards): cntrl=TREAT_to_CONTRL_DICT[wildcards.treatment] + #MACS2 OPTIONS if wildcards.peak_types == "narrowPeak": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2", wildcards.treatment, wildcards.treatment + "." + wildcards.dupstatus + "_peaks.narrowPeak") if wildcards.peak_types =="broadPeak": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2", wildcards.treatment, wildcards.treatment + "." + wildcards.dupstatus + "_peaks.broadPeak") + + #SEACR OPTIONS if wildcards.peak_types =="norm.stringent.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.treatment + "_vs_" + cntrl, wildcards.treatment + "_vs_" + cntrl + "." + wildcards.dupstatus + ".norm.stringent.bed") if wildcards.peak_types =="norm.relaxed.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.treatment + "_vs_" + cntrl, wildcards.treatment + "_vs_" + cntrl + "." + wildcards.dupstatus + ".norm.relaxed.bed") + if wildcards.peak_types =="non.stringent.bed": + bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.treatment + "_vs_" + cntrl, wildcards.treatment + "_vs_" + cntrl + "." + wildcards.dupstatus + ".non.stringent.bed") + if wildcards.peak_types =="non.relaxed.bed": + bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"seacr", wildcards.treatment + "_vs_" + cntrl, wildcards.treatment + "_vs_" + cntrl + "." + wildcards.dupstatus + ".non.relaxed.bed") + + #GOPEAKS OPTIONS if wildcards.peak_types =="narrowGo_peaks.bed": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks", wildcards.treatment + "_vs_" + cntrl + "." + wildcards.dupstatus + ".narrowGo_peaks.bed") if wildcards.peak_types =="broadGo_peaks.bed": @@ -184,10 +200,10 @@ rule rose: nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="{params.sampleID}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed fi - ## correct GOPEAKS + ## correct SEACR ### original: ### output: <$sampleid_uniquenumber> <.> - if [[ {params.peak_type} == "norm.stringent.bed" ]] || [[ {params.peak_type} == "norm.relaxed.bed" ]]; then + if [[ {params.peak_type} == "norm.stringent.bed" ]] || [[ {params.peak_type} == "norm.relaxed.bed" ]] || [[ {params.peak_type} == "non.relaxed.bed" ]] || [[ {params.peak_type} == "non.relaxed.bed" ]]; then cp $TMPDIR/subset.bed $TMPDIR/save.bed awk -v sample_id="{params.sampleID}_" \'{{print $1"\\t"$2"\\t"$3"\\t"sample_id$1"\\t"$4"\\t."}}\' $TMPDIR/subset.bed > $TMPDIR/col.txt paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index 577561d..c51128a 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -1,17 +1,28 @@ def get_contrast_init(wildcards): files=[] + # MACS2 options if "narrowPeak" in config["peaktype"]: n=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.narrowPeak"),qthresholds=QTRESHOLDS,replicate=REPLICATES,dupstatus=DUPSTATUS), files.extend(n) if "broadPeak" in config["peaktype"]: b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.broadPeak"),qthresholds=QTRESHOLDS,replicate=REPLICATES,dupstatus=DUPSTATUS), files.extend(b) + + # SEACR OPTIONS if "norm.stringent.bed" in config["peaktype"]: s=expand([join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.norm.stringent.bed")],zip,qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), files.extend(s) if "norm.relaxed.bed" in config["peaktype"]: r=expand([join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.norm.relaxed.bed")],zip,qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), files.extend(r) + if "non.stringent.bed" in config["peaktype"]: + s=expand([join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.stringent.bed")],zip,qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(s) + if "non.relaxed.bed" in config["peaktype"]: + r=expand([join(RESULTSDIR,"peaks","{qthresholds}","seacr","{treatment}_vs_{control}","{treatment}_vs_{control}.{dupstatus}.non.relaxed.bed")],zip,qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS,dupstatus=DUPSTATUS), + files.extend(r) + + # GOPEAKS OPTIONS if "narrowGo_peaks.bed" in config["peaktype"]: n=expand([join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","{treatment}_vs_{control}.dedup.narrowGo_peaks.bed")],zip,qthresholds=QTRESHOLDS,treatment=TREATMENTS,control=CONTROLS), files.extend(n) @@ -154,12 +165,16 @@ rule make_counts_matrix: rule DESeq: input: bbpaths=join(RESULTSDIR,"peaks","{qs}","contrasts","bed_bedgraph_paths.tsv"), # this has the scaling factors - cm=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_countsmatrix.txt"), + cm_auc=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_countsmatrix.txt"), + cm_frag=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentscountsmatrix.txt"), si=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_sampleinfo.txt"), output: - results=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffresults.txt"), - html=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffanalysis.html"), - elbowlimits=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffanalysis_elbowlimits.yaml"), + results_auc=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffresults.txt"), + html_auc=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffanalysis.html"), + elbowlimits_auc=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_AUCbased_diffanalysis_elbowlimits.yaml"), + results_frag=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffresults.txt"), + html_frag=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffanalysis.html"), + elbowlimits_frag=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffanalysis_elbowlimits.yaml"), params: rscript=join(SCRIPTSDIR,"_diff_markdown_wrapper.R"), rmd=join(SCRIPTSDIR,"_diff_markdown.Rmd"), @@ -180,97 +195,73 @@ rule DESeq: dirname=$(basename $(mktemp)) if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID/$dirname" + TMPDIR_AUC=$TMPDIR/AUC + TMPDIR_FRAG=$TMPDIR/FRAG else TMPDIR="/dev/shm/$dirname" + TMPDIR_AUC=$TMPDIR/AUC + TMPDIR_FRAG=$TMPDIR/FRAG fi - mkdir -p $TMPDIR - mkdir -p ${{TMPDIR}}/intermediates_dir - mkdir -p ${{TMPDIR}}/knit_root_dir - cd $TMPDIR + + ## Run AUC method + mkdir -p ${{TMPDIR_AUC}} + mkdir -p ${{TMPDIR_AUC}}/intermediates_dir + mkdir -p ${{TMPDIR_AUC}}/knit_root_dir + cd $TMPDIR_AUC + Rscript {params.rscript} \\ --rmd {params.rmd} \\ - --countsmatrix {input.cm} \\ + --countsmatrix {input.cm_auc} \\ --sampleinfo {input.si} \\ --dupstatus {params.ds} \\ --condition1 {params.condition1} \\ --condition2 {params.condition2} \\ --fdr_cutoff {params.fdr_cutoff} \\ --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results} \\ - --report {output.html} \\ - --elbowlimits {output.elbowlimits} \\ + --results {output.results_auc} \\ + --report {output.html_auc} \\ + --elbowlimits {output.elbowlimits_auc} \\ --spiked {params.spiked} \\ --rawcountsprescaled \\ --scalesfbymean \\ --bbpaths {input.bbpaths} \\ - --tmpdir $TMPDIR \\ + --tmpdir $TMPDIR_AUC \\ --species {params.species} \\ --gtf {params.gtf} # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits} - """ + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc} + + ## Run FRAG method + mkdir -p ${{TMPDIR_FRAG}} + mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir + mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir + cd $TMPDIR_FRAG -rule DESeq2: - input: - bbpaths=join(RESULTSDIR,"peaks","{qs}","contrasts","bed_bedgraph_paths.tsv"), # this has the scaling factors - cm=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentscountsmatrix.txt"), - si=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_sampleinfo.txt"), - output: - results=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffresults.txt"), - html=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffanalysis.html"), - elbowlimits=join(RESULTSDIR,"peaks","{qs}","contrasts","{c1}_vs_{c2}__{ds}__{pt}","{c1}_vs_{c2}__{ds}__{pt}_fragmentsbased_diffanalysis_elbowlimits.yaml"), - params: - rscript=join(SCRIPTSDIR,"_diff_markdown_wrapper.R"), - rmd=join(SCRIPTSDIR,"_diff_markdown.Rmd"), - condition1 = "{c1}", - condition2 = "{c2}", - ds = "{ds}", - pt = "{pt}", - spiked = SPIKED, # "Y" for spiked - fdr_cutoff = FDRCUTOFF, - log2fc_cutoff = LFCCUTOFF, - species = config["genome"], - gtf=config["reference"][config["genome"]]["gtf"] - envmodules: - TOOLS["R"] - shell: - """ - set -exo pipefail - dirname=$(basename $(mktemp)) - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then - TMPDIR="/lscratch/$SLURM_JOB_ID/$dirname" - else - TMPDIR="/dev/shm/$dirname" - fi - mkdir -p $TMPDIR - mkdir -p ${{TMPDIR}}/intermediates_dir - mkdir -p ${{TMPDIR}}/knit_root_dir - cd $TMPDIR # Do not use --rawcountsprescaled as these counts are not prescaled! Rscript {params.rscript} \\ --rmd {params.rmd} \\ - --countsmatrix {input.cm} \\ + --countsmatrix {input.cm_frag} \\ --sampleinfo {input.si} \\ --dupstatus {params.ds} \\ --condition1 {params.condition1} \\ --condition2 {params.condition2} \\ --fdr_cutoff {params.fdr_cutoff} \\ --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results} \\ - --report {output.html} \\ - --elbowlimits {output.elbowlimits} \\ + --results {output.results_frag} \\ + --report {output.html_frag} \\ + --elbowlimits {output.elbowlimits_frag} \\ --spiked {params.spiked} \\ --scalesfbymean \\ --bbpaths {input.bbpaths} \\ - --tmpdir $TMPDIR \\ + --tmpdir $TMPDIR_FRAG \\ --species {params.species} \\ --gtf {params.gtf} # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits} + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag} """ rule diffbb: diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index a8355da..adcdc40 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -18,6 +18,7 @@ rule macs2: replicate = "{replicate}", dupstatus = "{dupstatus}", qthresholds = "{qthresholds}", + broadtreshold = config["macs2_broad_peak_threshold"], control_flag = config["macs2_control"], control = get_cntrl_bed, macs2_genome = config["reference"][GENOME]["macs2_g"], @@ -38,12 +39,61 @@ rule macs2: if [[ ! -d {params.outdir} ]];then mkdir -p {params.outdir};fi cd {params.outdir} + # if running macs2 with an internal control if [[ {params.control_flag} == "Y" ]] && [[ {params.control} != "CONTROL" ]]; then - macs2 callpeak -t {input.fragments_bed} -c {params.control} -f BED -g {params.macs2_genome} --keep-dup all -q {params.qthresholds} -n {params.replicate}.{params.dupstatus} --SPMR --shift 0 --call-summits --nomodel - macs2 callpeak -t {input.fragments_bed} -c {params.control} -f BED -g {params.macs2_genome} --keep-dup all -q {params.qthresholds} -n {params.replicate}.{params.dupstatus} --SPMR --shift 0 --broad --nomodel + # narrow peak calling + macs2 callpeak \\ + -t {input.fragments_bed} \\ + -c {params.control} \\ + -f BED \\ + -g {params.macs2_genome} \\ + --keep-dup all \\ + -q {params.qthresholds} \\ + -n {params.replicate}.{params.dupstatus} \\ + --SPMR \\ + --shift 0 \\ + --call-summits \\ + --nomodel + + # broad peak calling + macs2 callpeak \\ + -t {input.fragments_bed} \\ + -c {params.control} \\ + -f BED \\ + -g {params.macs2_genome} \\ + --keep-dup all \\ + -q {params.qthresholds} \\ + -n {params.replicate}.{params.dupstatus} \\ + --SPMR \\ + --shift 0 \\ + --broad --broad-cutoff {params.broadtreshold} \\ + --nomodel else - macs2 callpeak -t {input.fragments_bed} -f BED -g {params.macs2_genome} --keep-dup all -q {params.qthresholds} -n {params.replicate}.{params.dupstatus} --SPMR --shift 0 --call-summits --nomodel - macs2 callpeak -t {input.fragments_bed} -f BED -g {params.macs2_genome} --keep-dup all -q {params.qthresholds} -n {params.replicate}.{params.dupstatus} --SPMR --shift 0 --broad --nomodel + # narrow peak calling + macs2 callpeak \\ + -t {input.fragments_bed} \\ + -f BED \\ + -g {params.macs2_genome} \\ + --keep-dup all \\ + -q {params.qthresholds} \\ + -n {params.replicate}.{params.dupstatus} \\ + --SPMR \\ + --shift 0 \\ + --call-summits \\ + --nomodel + + # broad peak calling + macs2 callpeak \\ + -t {input.fragments_bed} \\ + -f BED \\ + -g {params.macs2_genome} \\ + --keep-dup all \\ + -q {params.qthresholds} \\ + -n {params.replicate}.{params.dupstatus} \\ + --SPMR \\ + --shift 0 \\ + --broad --broad-cutoff {params.broadtreshold} \\ + --nomodel fi """ @@ -55,15 +105,18 @@ rule peak2bb_macs2: broadPeak = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.broadPeak"), genome_len = join(BOWTIE2_INDEX,"genome.len"), output: - narrowbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.narrow.bigbed"), - broadbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.broad.bigbed"), + narrowbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.narrow.bigbed.gz"), + broadbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.broad.bigbed.gz"), params: replicate="{replicate}", dupstatus="{dupstatus}", memG = getmemG("peapeak2bb_macs2k2bb"), + narrowbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.narrow.bigbed"), + broadbb = join(RESULTSDIR,"peaks","{qthresholds}","macs2","{replicate}","{replicate}.{dupstatus}_peaks.broad.bigbed"), threads: getthreads("peak2bb_macs2") envmodules: - TOOLS["ucsc"] + TOOLS["ucsc"], + TOOLS["samtools"] shell: """ set -exo pipefail @@ -74,10 +127,15 @@ rule peak2bb_macs2: TMPDIR="/dev/shm/$dirname" mkdir -p $TMPDIR fi + # create files cut -f1-3 {input.narrowPeak} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.narrow.bed - bedToBigBed -type=bed3 ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.narrow.bed {input.genome_len} {output.narrowbb} + bedToBigBed -type=bed3 ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.narrow.bed {input.genome_len} {params.narrowbb} cut -f1-3 {input.broadPeak} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.broad.bed - bedToBigBed -type=bed3 ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.broad.bed {input.genome_len} {output.broadbb} + bedToBigBed -type=bed3 ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.broad.bed {input.genome_len} {params.broadbb} + + # zip files + bgzip {params.narrowbb} > {output.narrowbb} + bgzip {params.broadbb} > {output.broadbb} """ def get_input_bedgraphs(wildcards):