diff --git a/.test/hg38test/config/hg38test_atacseq_pipeline_config.yaml b/.test/hg38test/config/hg38test_atacseq_pipeline_config.yaml index 525aff4..9b8a503 100644 --- a/.test/hg38test/config/hg38test_atacseq_pipeline_config.yaml +++ b/.test/hg38test/config/hg38test_atacseq_pipeline_config.yaml @@ -8,7 +8,6 @@ partition: 'shortq' project_name: hg38test #MyATACproject # name of the project/dataset result_path: results/hg38test/ #/path/to/results/ # path to the output folder annotation: .test/hg38test/config/hg38test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to annotation file, specified in config/README.md -email: sreichl@cemm.at # used for UCSC hub generation ##### PROCESSING ##### @@ -22,7 +21,6 @@ sequencing_center: CeMM_BSF ##### REPORT ##### # sample(!) specific annotation columns of interest from the annotation sheet (note: unit specific annotations are not retained) -# first entry is used to color UCSC Genome Browser tracks annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. must be columns in the annotation sheet ##### QUANTIFICATION ##### diff --git a/.test/mm10test/config/mm10test_atacseq_pipeline_config.yaml b/.test/mm10test/config/mm10test_atacseq_pipeline_config.yaml index 00c47ad..e4e9a50 100755 --- a/.test/mm10test/config/mm10test_atacseq_pipeline_config.yaml +++ b/.test/mm10test/config/mm10test_atacseq_pipeline_config.yaml @@ -8,7 +8,6 @@ partition: 'shortq' project_name: mm10test #MyATACproject # name of the project/dataset result_path: results/mm10test/ #/path/to/results/ # path to the output folder annotation: .test/mm10test/config/mm10test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to annotation file, specified in config/README.md -email: sreichl@cemm.at # used for UCSC hub generation ##### PROCESSING ##### @@ -22,7 +21,6 @@ sequencing_center: CeMM_BSF ##### REPORT ##### # sample(!) specific annotation columns of interest from the annotation sheet (note: unit specific annotations are not retained) -# first entry is used to color UCSC Genome Browser tracks annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. must be columns in the annotation sheet ##### QUANTIFICATION ##### diff --git a/README.md b/README.md index ea04a44..d829719 100755 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ From r**A**w/un**A**ligned BAM files to count**Z**. A [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow implementation of the [BSF's](https://www.biomedical-sequencing.org/) [ATAC-seq Data Processing Pipeline](https://github.com/berguner/atacseq_pipeline "ATAC-seq Data Processing Pipeline") extended by downstream quantification and annotation steps using Bash and Python. -This workflow adheres to the module specifications of [MR.PARETO](https://github.com/epigen/mr.pareto), an effort to augment research by modularizing (biomedical) data science. For more details and modules check out the project's repository. +This workflow adheres to the module specifications of [MR.PARETO](https://github.com/epigen/mr.pareto), an effort to augment research by modularizing (biomedical) data science. For more details and modules check out the project's repository. Please consider **starring** and sharing modules that are interesting or useful to you, this helps others to find and benefit from the effort and me to prioritize my efforts! **If you use this workflow in a publication, please don't forget to give credits to the authors by citing it using this DOI [10.5281/zenodo.6323634](https://doi.org/10.5281/zenodo.6323634).** @@ -21,7 +21,6 @@ Table of contents * [Configuration](#configuration) * [Examples](#examples) * [Quality Control](#quality-control) - * [Genome Browser Tracks](#genome-browser-tracks) * [Links](#links) * [Resources](#resources) * [Publications](#publications) @@ -75,7 +74,7 @@ The processing and quantification described here was performed using a publicly - Note: even though the peak support of a region in a certain sample is 0, does not mean that there are no reads counted in the count matrix, it just states that there was no peak called. - Note: the peak support can be >1 for certain samples in case of a consensus region spanning more than one peak within the respective sample. - Peak annotation and motif analysis HOMER. - - Coverage analysis generating bigWig files (hub/) and quantifying TSS coverage (results/). + - Quantification of TSS coverage (results/). - Reporting (report/) - MultiQC report generation using MultiQC, extended with an in-house developed plugin [atacseq_report](./workflow/scripts/multiqc_atacseq). - Quantification (counts/) @@ -91,7 +90,6 @@ The processing and quantification described here was performed using a publicly - UROPA with regulatory build and gencode as references - HOMER with annotatePeaks.pl - bedtools for nucleotide counts/content (e.g., % of GC) -- UCSC Genome Browser Trackhub (hub/) # Usage These steps are the recommended usage for this workflow: @@ -126,7 +124,7 @@ Below are some guidelines for the manual quality control of each sample, but kee - Regulatory regions >10% (as it is roughly 10% of the genome) - TSS (Transcription Start Site) normalized coverage ideally > 4 (at least >2) - % Duplications “not excessive” -5. Inspect [Genome Browser Tracks](#genome-browser-tracks) using UCSC Genome Browser (online) or IGV (local) +5. Inspect [Genome Browser Tracks](https://github.com/epigen/genome_tracks/) using UCSC Genome Browser (online) or IGV (local) - Compare all samples to the best, based on above's QC metrics. - Check cell type / experiment-specific markers for accessibility as positive controls. - Check e.g., developmental regions for accessibility as negative controls. @@ -144,25 +142,6 @@ My personal QC value scheme to inform downstream analyses (e.g., unsupervised an Finally, a previous PhD student in our lab, [André Rendeiro](https://orcid.org/0000-0001-9362-5373), wrote about ["ATAC-seq sample quality, wet lab troubleshooting and advice"](https://github.com/epigen/open_pipelines/blob/master/pipelines/atacseq.md#sample-quality-wet-lab-troubleshooting-and-advice). -# Genome Browser Tracks -The `hub` directory contains the read coverage per sample in .bigWig format for visual inspection of each sample e.g., during QC. Below are instructions for two different approaches (online/local). - -## UCSC Genome Browser Track Hub (online) -0. Requirement: web server. -1. Copy (or symlink) the `hub` directory to an accessible location on your web server (=web_server_location). -2. Create a UCSC Genome Browser hyperlink - - the general formula is: ucsc_url + genome + web_server_location + hub/hub.txt - - concretely: `http://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=` + {genome} + `&hubUrl=` + {web_server_location} + `hub/hub.txt` - - [mm10test](http://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=mm10&hubUrl=https://medical-epigenomics.org/data/atacseq_pipeline/mm10test/hub/hub.txt): `http://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=mm10&hubUrl=https://medical-epigenomics.org/data/atacseq_pipeline/mm10test/hub/hub.txt` - - [hg38test](http://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&hubUrl=https://medical-epigenomics.org/data/atacseq_pipeline/hg38test/hub/hub.txt): `http://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&hubUrl=https://medical-epigenomics.org/data/atacseq_pipeline/hg38test/hub/hub.txt` -3. Share the link with the world e.g., collaborators or upon publication of your data. - -## IGV: Integrative Genomics Viewer (local) -0. Requirement: [IGV Desktop application](https://igv.org/doc/desktop/). -1. Open IGV. -2. Select genome. -3. Drag and drop all/selected .bigWig files from the `hub` directory directly into the IGV application. - # Links - [GitHub Repository](https://github.com/epigen/atacseq_pipeline/) - [GitHub Page](https://epigen.github.io/atacseq_pipeline/) @@ -188,11 +167,11 @@ The `hub` directory contains the read coverage per sample in .bigWig format for unzip indices_for_Bowtie2.zip && rm indices_for_Bowtie2.zip ``` - Recommended [MR.PARETO](https://github.com/epigen/mr.pareto) modules for downstream analyses (in that order): + - [Genome Browser Track Visualization](https://github.com/epigen/genome_tracks/) for quality control and visual inspection/analysis of genomic regions/genes of interest or top hits. - [Split, Filter, Normalize and Integrate Sequencing Data](https://github.com/epigen/spilterlize_integrate/) after quantification. - [Unsupervised Analysis](https://github.com/epigen/unsupervised_analysis) to understand and visualize similarities and variations between samples. - [Differential Analysis with limma](https://github.com/epigen/dea_limma) to identify and visualize statistically significant genomic regions between sample groups. - [Enrichment Analysis](https://github.com/epigen/enrichment_analysis) for biomedical interpretation of differential analysis results. - - [Genome Tracks](https://github.com/epigen/genome_tracks) for visualization of genomic regions of interest or top hits. # Publications The following publications successfully used this module for their analyses. diff --git a/config/README.md b/config/README.md index f75a705..0e6a590 100644 --- a/config/README.md +++ b/config/README.md @@ -7,7 +7,7 @@ You need one configuration file and one annotation file to run the complete work - sample_name (first column!) - read_type: "single" or "paired". - bam_file: path to the raw/unaligned BAM file. - - pass_qc: number between 0 (not used for downstream steps e.g., quantification) and 1. Every sample with pass_qc>0 is included in the downstream analysis. + - pass_qc: number between 0 (not used for downstream steps e.g., quantification) and 1. Every sample with pass_qc>0 is included in the downstream quantification and annotation steps. - (optional) additional sample metadata columns can be added and indicated for inclusion in the report. 2 examples (hg38 & mm10) are provided in the .test/ directory and are pre-filled with resource locations from the [respective Zenodo download](../README.md#resources). diff --git a/config/config.yaml b/config/config.yaml index ca8821c..13482c7 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -8,7 +8,6 @@ partition: 'shortq' project_name: MyATACproject # name of the project/dataset result_path: /path/to/results/ # path to the output folder annotation: /path/to/MyATACproject_pipeline_annotation.csv # path to annotation file, specified in config/README.md -email: sreichl@cemm.at # used for UCSC hub generation ##### PROCESSING ##### @@ -21,8 +20,7 @@ sequencing_center: CeMM_BSF ##### REPORT ##### -# sample(!) specific annotation columns of interest from the annotation sheet (note: unit specific annotations are not retained) -# first entry is used to color UCSC Genome Browser tracks +# sample(!) specific annotation columns of interest from the annotation sheet (note: sequencing unit specific annotations are not retained) annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. must be columns in the annotation sheet ##### QUANTIFICATION ##### diff --git a/workflow/envs/multiqc.yaml b/workflow/envs/multiqc.yaml index 63c60d0..02e1719 100644 --- a/workflow/envs/multiqc.yaml +++ b/workflow/envs/multiqc.yaml @@ -10,5 +10,5 @@ dependencies: - pytables=3.9.1 - pip=23.3.1 - pip: - - 'git+https://github.com/epigen/atacseq_pipeline/#egg=atacseq_report&subdirectory=workflow/scripts/multiqc_atacseq' # works -# - -e /home/sreichl/projects/atacseq_pipeline/workflow/scripts/multiqc_atacseq # works but only local +# - 'git+https://github.com/epigen/atacseq_pipeline/#egg=atacseq_report&subdirectory=workflow/scripts/multiqc_atacseq' # works + - -e /home/sreichl/projects/atacseq_pipeline/workflow/scripts/multiqc_atacseq # works but only local diff --git a/workflow/rules/processing.smk b/workflow/rules/processing.smk index c26b544..e9ee3eb 100644 --- a/workflow/rules/processing.smk +++ b/workflow/rules/processing.smk @@ -88,11 +88,6 @@ rule tss_coverage: "logs/rules/coverage_{sample}.log" shell: """ - #bamCoverage --bam {input.bam} \ - # -p max --binSize 10 --normalizeUsing RPGC \ - # --effectiveGenomeSize {params.genome_size} --extendReads 175 \ - # -o "{output.bigWig}" > "{output.bigWig_log}" 2>&1; - echo "base,count" > {output.tss_hist}; bedtools slop -b {params.tss_slop} -i {params.unique_tss} -g {params.chromosome_sizes} | \ bedtools coverage -a - -b {input.bam} -d -sorted | \ diff --git a/workflow/rules/quantification.smk b/workflow/rules/quantification.smk index 46ed713..de364f3 100644 --- a/workflow/rules/quantification.smk +++ b/workflow/rules/quantification.smk @@ -132,39 +132,29 @@ rule homer_aggregate: threads: config.get("threads", 2) log: "logs/rules/homer_aggregate.log" - shell: - """ - first_file=true - - > {output} - - for file in {input}; do - if [ -s file ]; then - sample=$(awk -F'/' '{{print $(NF-2)}}' <<< "$file") + run: + combined_df = pd.DataFrame() - if $first_file; then - awk -v prefix="$sample" 'BEGIN {{FS="\\t"; OFS=","}} {{ - if (NR==1) {{ - gsub(" ", "_", $0); - printf "sample" OFS; - for (i=1; i<=NF; i++) printf "%s%s", $i, (i> {output} + for file_path in input: + if os.path.getsize(file_path) > 0: + sample = file_path.split('/')[-3] + df = pd.read_csv(file_path, sep='\t') - first_file=false - else - awk -v prefix="$sample" 'BEGIN {{FS="\\t"; OFS=","}} FNR > 1 {{printf prefix OFS "\\"" $1 "\\""; - for (i=2; i<=NF; i++) printf "%s%s", OFS, $i; - print ""; - }}' "$file" >> {output} - fi - fi - done - """ + # replace white space in column names + df.columns = [col.replace(' ', '_') for col in df.columns] + + # Remove columns that start with '#_' (unique per sample) + df = df.loc[:, ~df.columns.str.startswith('#_')] + + # add sample name + df.insert(0, 'sample_name', sample) + + if combined_df.shape[0]==0: + combined_df = df + else: + combined_df = pd.concat([combined_df, df], ignore_index=True) + + combined_df.to_csv(output[0], index=False) # map consensus regions to closest TSS per gene rule map_consensus_tss: diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 94705d6..fd2cae8 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -109,8 +109,8 @@ rule multiqc: expand(os.path.join(result_path,"results","{sample}","mapped", "{sample}.filtered.bam"), sample=samples.keys()), expand(os.path.join(result_path,"results","{sample}","peaks","{sample}_peaks.narrowPeak"), sample=samples.keys()), # expand(os.path.join(result_path, "hub","{sample}.bigWig"),sample=samples.keys()), +# trackdb_file = os.path.join(result_path, "hub", config["genome"], "trackDb.txt"), # representing UCSC hub expand(os.path.join(result_path, 'report', '{sample}_peaks.xls'), sample=samples.keys()), # representing symlinked stats - trackdb_file = os.path.join(result_path, "hub", config["genome"], "trackDb.txt"), # representing UCSC hub sample_annotation = config["annotation"], output: multiqc_report = report(os.path.join(result_path,"report","multiqc_report.html"),