add test config files for hg38 and mm10 #20

epigen · Nov 29, 2023 · f1ca333 · f1ca333
1 parent fa7ef9d
commit f1ca333
Show file tree

Hide file tree

Showing 2 changed files with 176 additions and 0 deletions.
diff --git a/hg38test_atacseq_pipeline_config.yaml b/hg38test_atacseq_pipeline_config.yaml
@@ -0,0 +1,88 @@
+
+##### RESOURCES #####
+mem: '32000'
+threads: 2
+partition: 'shortq'
+
+##### GENERAL #####
+project_name: hg38test #MyATACproject # name of the project/dataset
+result_path: results/hg38test/ #/path/to/results/ # path to the output folder
+annotation: test/hg38test/hg38test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to annotation file, specified in config/README.md
+
+##### PROCESSING #####
+
+# genome in the project
+genome: hg38
+
+##### REPORT #####
+
+# sample(!) specific annotation columns of interest from the annotation sheet (note: unit specific annotations are not retained)
+# first entry is used to color UCSC Genome Browser tracks
+annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. must be columns in the annotation sheet
+
+##### QUANTIFICATION #####
+# coverage calculation with bedtools for QC report
+tss_slop: 2000
+noise_lower: 100
+
+# determination of consensus regions using (py)bedtools
+slop_extension: 250
+
+##### ANNOTATION #####
+# region annotation parameters for UROPA with gencode_gtf reference
+
+# assumed size of transcription start sites (TSS)
+tss_size: 100
+
+# assumed TSS proximal distance upstream
+proximal_size_up: 1000
+
+# assumed TSS proximal distance downstream
+proximal_size_dn: 500
+
+# distal distance 
+distal_size: 10000
+
+
+##### RESOURCES #####
+# specify paths to resource files required by the pipeline
+# download resources for the GRCm38 (mm10) assembly of the mouse genome: https://zenodo.org/records/6344322
+# download resources for the GRCh38 (hg38) assembly of the human genome: https://zenodo.org/records/6344174
+
+# (nextera) adapter fasta (.fa) file and/or nucleotide adapter sequence of the used ATAC-seq protocol (used by Bowtie2 if provided)
+# if not available/applicable leave empty: ""
+adapter_fasta: resources/atacseq_pipeline/hg38/atacseq/nextera_adapters.fa
+adapter_sequence: GTCTCGTGGGCTCGG
+
+# indices for Bowtie2
+bowtie2_index: resources/atacseq_pipeline/hg38/indices_for_Bowtie2/hg38
+
+# chromosome lengths for a given genome
+chromosome_sizes: resources/atacseq_pipeline/hg38/atacseq/hg38.chromSizes
+
+# blacklisted regions from ENCODE as .bed files
+blacklisted_regions: resources/atacseq_pipeline/hg38/atacseq/hg38-ENCODE_blacklist.sorted.v3.bed
+
+# complement to the blacklisted regions as .bed files
+whitelisted_regions: resources/atacseq_pipeline/hg38/atacseq/hg38-ENCODE_whitelist.sorted.v3.bed
+
+# .bed files, from e.g., gencode (hg38) or CCDS (mm10)
+unique_tss: resources/atacseq_pipeline/hg38/atacseq/hg38_gencode_tss_unique.sorted.bed
+
+# regulatory build regulatory features chromosomes only from e.g., Ensembl
+regulatory_regions: resources/atacseq_pipeline/hg38/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.chromosomes_only.20161111.bed
+
+# length of genomes as integer
+genome_size: 2747877777
+
+# abbreviation of mitochondria chromosome
+mitochondria_name: chrM
+
+# genomes as .fa files
+genome_fasta: resources/atacseq_pipeline/hg38/hg38.fa
+
+# gencode .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
+gencode_gtf: resources/atacseq_pipeline/hg38/gencode.v38.basic.annotation.gtf
+
+# regulatory build regulatory features .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
+regulatory_build_gtf: resources/atacseq_pipeline/hg38/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20210107.gtf
diff --git a/mm10test_atacseq_pipeline_config.yaml b/mm10test_atacseq_pipeline_config.yaml
@@ -0,0 +1,88 @@
+
+##### RESOURCES #####
+mem: '32000'
+threads: 2
+partition: 'shortq'
+
+##### GENERAL #####
+project_name: mm10test #MyATACproject # name of the project/dataset
+result_path: results/mm10test/ #/path/to/results/ # path to the output folder
+annotation: test/mm10test/mm10test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to annotation file, specified in config/README.md
+
+##### PROCESSING #####
+
+# genome in the project
+genome: mm10
+
+##### REPORT #####
+
+# sample(!) specific annotation columns of interest from the annotation sheet (note: unit specific annotations are not retained)
+# first entry is used to color UCSC Genome Browser tracks
+annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. must be columns in the annotation sheet
+
+##### QUANTIFICATION #####
+# coverage calculation with bedtools for QC report
+tss_slop: 2000
+noise_lower: 100
+
+# determination of consensus regions using (py)bedtools
+slop_extension: 250
+
+##### ANNOTATION #####
+# region annotation parameters for UROPA with gencode_gtf reference
+
+# assumed size of transcription start sites (TSS)
+tss_size: 100
+
+# assumed TSS proximal distance upstream
+proximal_size_up: 1000
+
+# assumed TSS proximal distance downstream
+proximal_size_dn: 500
+
+# distal distance 
+distal_size: 10000
+
+
+##### RESOURCES #####
+# specify paths to resource files required by the pipeline
+# download resources for the GRCm38 (mm10) assembly of the mouse genome: https://zenodo.org/records/6344322
+# download resources for the GRCh38 (hg38) assembly of the human genome: https://zenodo.org/records/6344174
+
+# (nextera) adapter fasta (.fa) file and/or nucleotide adapter sequence of the used ATAC-seq protocol (used by Bowtie2 if provided)
+# if not available/applicable leave empty: ""
+adapter_fasta: resources/atacseq_pipeline/mm10/atacseq/nextera_adapters.fa
+adapter_sequence: GTCTCGTGGGCTCGG
+
+# indices for Bowtie2
+bowtie2_index: resources/atacseq_pipeline/mm10/indices_for_Bowtie2/mm10
+
+# chromosome lengths for a given genome
+chromosome_sizes: resources/atacseq_pipeline/mm10/atacseq/mm10.chromSizes
+
+# blacklisted regions from ENCODE as .bed files
+blacklisted_regions: resources/atacseq_pipeline/mm10/atacseq/mm10-ENCODE_blacklist.sorted.v2.bed
+
+# complement to the blacklisted regions as .bed files
+whitelisted_regions: resources/atacseq_pipeline/mm10/atacseq/mm10-ENCODE_whitelist.sorted.v2.bed
+
+# .bed files, from e.g., gencode (hg38) or CCDS (mm10)
+unique_tss: resources/atacseq_pipeline/mm10/atacseq/mm10_CCDS_tss_unique.bed
+
+# regulatory build regulatory features chromosomes only from e.g., Ensembl
+regulatory_regions: resources/atacseq_pipeline/mm10/mus_musculus.GRCm38.Regulatory_Build.regulatory_features.20161111.sorted.bed
+
+# length of genomes as integer
+genome_size: 2407883318
+
+# abbreviation of mitochondria chromosome
+mitochondria_name: chrM
+
+# genomes as .fa files
+genome_fasta: resources/atacseq_pipeline/mm10/mm10.fa
+
+# gencode .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
+gencode_gtf: resources/atacseq_pipeline/mm10/gencode.vM25.chr_patch_hapl_scaff.basic.annotation.gtf
+
+# regulatory build regulatory features .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
+regulatory_build_gtf: resources/atacseq_pipeline/mm10/mus_musculus.GRCm38.Regulatory_Build.regulatory_features.20180516.gtf