Skip to content

Commit

Permalink
add test config files for hg38 and mm10 #20
Browse files Browse the repository at this point in the history
  • Loading branch information
sreichl authored Nov 29, 2023
1 parent fa7ef9d commit f1ca333
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 0 deletions.
88 changes: 88 additions & 0 deletions hg38test_atacseq_pipeline_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@

##### RESOURCES #####
# compute settings (memory, threads, partition); not to be confused with the
# second RESOURCES section further down, which lists resource file paths
# memory request, kept as a quoted string (presumably MB - TODO confirm units)
mem: '32000'
# thread count (presumably per job - confirm against the workflow rules)
threads: 2
# presumably the cluster scheduler partition/queue jobs are submitted to - confirm
partition: 'shortq'

##### GENERAL #####
# project identity plus input/output locations for the hg38 test run
# (the first '#...' token after each value looks like a leftover template placeholder)
project_name: hg38test #MyATACproject # name of the project/dataset
result_path: results/hg38test/ #/path/to/results/ # path to the output folder
annotation: test/hg38test/hg38test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to the sample annotation file, as specified in config/README.md

##### PROCESSING #####

# genome assembly used in the project; every resource path in this file sits
# under resources/atacseq_pipeline/hg38/, so keep those paths in sync with this value
genome: hg38

##### REPORT #####

# sample-specific (not unit-specific) annotation columns of interest from the annotation sheet;
# unit-specific annotations are not retained
# the first entry ('pass_qc' here) is used to color UCSC Genome Browser tracks
annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. every entry must be a column in the annotation sheet

##### QUANTIFICATION #####
# coverage calculation with bedtools for QC report
# presumably the flank (in bp) added around each TSS - TODO confirm
tss_slop: 2000
# NOTE(review): meaning of noise_lower is not evident from this file - confirm against the workflow
noise_lower: 100

# determination of consensus regions using (py)bedtools
# presumably the extension (in bp) applied to regions before they are merged - TODO confirm
slop_extension: 250

##### ANNOTATION #####
# region annotation parameters for UROPA with gencode_gtf reference
# all sizes/distances below are presumably in base pairs - TODO confirm

# assumed size of transcription start sites (TSS)
tss_size: 100

# assumed TSS proximal distance upstream
proximal_size_up: 1000

# assumed TSS proximal distance downstream
proximal_size_dn: 500

# distal distance
distal_size: 10000


##### RESOURCES #####
# specify paths to resource files required by the pipeline
# (this section is about reference files; the compute settings live in the RESOURCES section at the top of the file)
# download resources for the GRCm38 (mm10) assembly of the mouse genome: https://zenodo.org/records/6344322
# download resources for the GRCh38 (hg38) assembly of the human genome: https://zenodo.org/records/6344174

# (nextera) adapter fasta (.fa) file and/or nucleotide adapter sequence of the used ATAC-seq protocol (used by Bowtie2 if provided)
# if not available/applicable leave empty: ""
adapter_fasta: resources/atacseq_pipeline/hg38/atacseq/nextera_adapters.fa
adapter_sequence: GTCTCGTGGGCTCGG

# indices for Bowtie2
# NOTE(review): the value ends in 'hg38' with no file extension, so it is presumably an index prefix rather than a single file - confirm
bowtie2_index: resources/atacseq_pipeline/hg38/indices_for_Bowtie2/hg38

# chromosome lengths for a given genome
chromosome_sizes: resources/atacseq_pipeline/hg38/atacseq/hg38.chromSizes

# blacklisted regions from ENCODE as .bed files
blacklisted_regions: resources/atacseq_pipeline/hg38/atacseq/hg38-ENCODE_blacklist.sorted.v3.bed

# complement to the blacklisted regions as .bed files
whitelisted_regions: resources/atacseq_pipeline/hg38/atacseq/hg38-ENCODE_whitelist.sorted.v3.bed

# unique TSS as .bed files, from e.g., gencode (hg38) or CCDS (mm10)
unique_tss: resources/atacseq_pipeline/hg38/atacseq/hg38_gencode_tss_unique.sorted.bed

# regulatory build regulatory features chromosomes only from e.g., Ensembl
regulatory_regions: resources/atacseq_pipeline/hg38/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.chromosomes_only.20161111.bed

# length of the genome as integer
# NOTE(review): possibly an effective/mappable size rather than the total assembly length - confirm how the workflow uses it
genome_size: 2747877777

# abbreviation of mitochondria chromosome
mitochondria_name: chrM

# genomes as .fa files
genome_fasta: resources/atacseq_pipeline/hg38/hg38.fa

# gencode .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
# NOTE(review): the "generated by ... parse_reg_build_file.py" remark is identical to the one on regulatory_build_gtf
# below and the file name looks like a stock GENCODE release - confirm whether the remark really applies here
gencode_gtf: resources/atacseq_pipeline/hg38/gencode.v38.basic.annotation.gtf

# regulatory build regulatory features .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
regulatory_build_gtf: resources/atacseq_pipeline/hg38/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20210107.gtf
88 changes: 88 additions & 0 deletions mm10test_atacseq_pipeline_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@

##### RESOURCES #####
# compute settings (memory, threads, partition); not to be confused with the
# second RESOURCES section further down, which lists resource file paths
# memory request, kept as a quoted string (presumably MB - TODO confirm units)
mem: '32000'
# thread count (presumably per job - confirm against the workflow rules)
threads: 2
# presumably the cluster scheduler partition/queue jobs are submitted to - confirm
partition: 'shortq'

##### GENERAL #####
# project identity plus input/output locations for the mm10 test run
# (the first '#...' token after each value looks like a leftover template placeholder)
project_name: mm10test #MyATACproject # name of the project/dataset
result_path: results/mm10test/ #/path/to/results/ # path to the output folder
annotation: test/mm10test/mm10test_atacseq_pipeline_annotation.csv #/path/to/sample_annotation.csv # path to the sample annotation file, as specified in config/README.md

##### PROCESSING #####

# genome assembly used in the project; every resource path in this file sits
# under resources/atacseq_pipeline/mm10/, so keep those paths in sync with this value
genome: mm10

##### REPORT #####

# sample-specific (not unit-specific) annotation columns of interest from the annotation sheet;
# unit-specific annotations are not retained
# the first entry ('pass_qc' here) is used to color UCSC Genome Browser tracks
annot_columns: ['pass_qc','read_type','organism'] # (optional) can be empty [""]. every entry must be a column in the annotation sheet

##### QUANTIFICATION #####
# coverage calculation with bedtools for QC report
# presumably the flank (in bp) added around each TSS - TODO confirm
tss_slop: 2000
# NOTE(review): meaning of noise_lower is not evident from this file - confirm against the workflow
noise_lower: 100

# determination of consensus regions using (py)bedtools
# presumably the extension (in bp) applied to regions before they are merged - TODO confirm
slop_extension: 250

##### ANNOTATION #####
# region annotation parameters for UROPA with gencode_gtf reference
# all sizes/distances below are presumably in base pairs - TODO confirm

# assumed size of transcription start sites (TSS)
tss_size: 100

# assumed TSS proximal distance upstream
proximal_size_up: 1000

# assumed TSS proximal distance downstream
proximal_size_dn: 500

# distal distance
distal_size: 10000


##### RESOURCES #####
# specify paths to resource files required by the pipeline
# (this section is about reference files; the compute settings live in the RESOURCES section at the top of the file)
# download resources for the GRCm38 (mm10) assembly of the mouse genome: https://zenodo.org/records/6344322
# download resources for the GRCh38 (hg38) assembly of the human genome: https://zenodo.org/records/6344174

# (nextera) adapter fasta (.fa) file and/or nucleotide adapter sequence of the used ATAC-seq protocol (used by Bowtie2 if provided)
# if not available/applicable leave empty: ""
adapter_fasta: resources/atacseq_pipeline/mm10/atacseq/nextera_adapters.fa
adapter_sequence: GTCTCGTGGGCTCGG

# indices for Bowtie2
# NOTE(review): the value ends in 'mm10' with no file extension, so it is presumably an index prefix rather than a single file - confirm
bowtie2_index: resources/atacseq_pipeline/mm10/indices_for_Bowtie2/mm10

# chromosome lengths for a given genome
chromosome_sizes: resources/atacseq_pipeline/mm10/atacseq/mm10.chromSizes

# blacklisted regions from ENCODE as .bed files
blacklisted_regions: resources/atacseq_pipeline/mm10/atacseq/mm10-ENCODE_blacklist.sorted.v2.bed

# complement to the blacklisted regions as .bed files
whitelisted_regions: resources/atacseq_pipeline/mm10/atacseq/mm10-ENCODE_whitelist.sorted.v2.bed

# unique TSS as .bed files, from e.g., gencode (hg38) or CCDS (mm10)
unique_tss: resources/atacseq_pipeline/mm10/atacseq/mm10_CCDS_tss_unique.bed

# regulatory build regulatory features chromosomes only from e.g., Ensembl
# NOTE(review): unlike the comment, this file name carries no 'chromosomes_only' tag - confirm the file is the chromosome-restricted version
regulatory_regions: resources/atacseq_pipeline/mm10/mus_musculus.GRCm38.Regulatory_Build.regulatory_features.20161111.sorted.bed

# length of the genome as integer
# NOTE(review): possibly an effective/mappable size rather than the total assembly length - confirm how the workflow uses it
genome_size: 2407883318

# abbreviation of mitochondria chromosome
mitochondria_name: chrM

# genomes as .fa files
genome_fasta: resources/atacseq_pipeline/mm10/mm10.fa

# gencode .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
# NOTE(review): the "generated by ... parse_reg_build_file.py" remark is identical to the one on regulatory_build_gtf
# below and the file name looks like a stock GENCODE release - confirm whether the remark really applies here
gencode_gtf: resources/atacseq_pipeline/mm10/gencode.vM25.chr_patch_hapl_scaff.basic.annotation.gtf

# regulatory build regulatory features .gtf files (generated by running /workflow/scripts/parse_reg_build_file.py on gff.gz files from Ensembl http://www.ensembl.org/)
regulatory_build_gtf: resources/atacseq_pipeline/mm10/mus_musculus.GRCm38.Regulatory_Build.regulatory_features.20180516.gtf

0 comments on commit f1ca333

Please sign in to comment.