-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpipeline.xml
186 lines (159 loc) · 16.2 KB
/
pipeline.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
<?xml version="1.0" encoding="UTF-8"?>
<pipeline xmlns="http://www.sing-group.org/compi/pipeline-1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<version>1.1.1</version>
<params>
    <!-- Project locations, reference data and input tables. -->
    <param name="workingDir" shortName="wd" global="true" defaultValue="/working_dir">The working directory of the project.</param>
    <param name="fastqDir" shortName="dd" global="true" defaultValue="input/fastq">The directory containing the fastq files (default is relative to workingDir).</param>
    <param name="outDir" shortName="o" global="true" defaultValue="output">The directory containing the pipeline outputs (relative to workingDir).</param>
    <param name="organism" shortName="ogn" global="true" defaultValue="Homo sapiens">The organism from which the data was obtained, needed for the functional enrichment analysis.</param>
    <param name="adapter" shortName="adp" global="true" defaultValue="NA">The sequence of the adapter to remove.</param>
    <param name="genome" shortName="genome" global="true" defaultValue="">The directory path to the genome to align.</param>
    <param name="bwtIndex" shortName="bwti" global="true" defaultValue="">The absolute path to the rootname of the Bowtie index.</param>
    <param name="gffFile" shortName="gff">The path to the .gff file of the reference genome.</param>
    <param name="gffFeature" shortName="gfft" defaultValue="miRNA">Feature of the .gff file to use for the annotations (eg.: miRNA, gene, transcript...), default miRNA.</param>
    <param name="gffAttribute" shortName="gffa" defaultValue="Name">Attribute of the .gff to use in the annotations (eg.: Name, gene_id, transcript_id...), default Name.</param>
    <param name="conditions" shortName="cond">The path to the .tsv file with the rootnames of the samples, conditions and labels.</param>
    <param name="contrast" shortName="cont">The path to the .tsv file with the contrast DESeq2 has to perform.</param>

    <!-- Analysis thresholds and plotting options. -->
    <param name="qvalue" shortName="qval" global="true" defaultValue="0.05">FDR-corrected pvalue used to filter miRNAs after the differential expression analysis.</param>
    <param name="log2FC" shortName="logfc" global="true" defaultValue="0.5">Absolute value of the log2FC, used to filter miRNAs after the differential expression analysis.</param>
    <param name="distance_method" shortName="dmt" global="true" defaultValue="euclidean">Method used to compute distances on the hierarchical clustering step, default euclidean.</param>
    <param name="vennFormat" shortName="venf" global="true" defaultValue="png">The file format of the Venn diagram (png/svg/tiff), default png.</param>

    <!-- Output subdirectories, one per pipeline stage. -->
    <param name="fqcOut" shortName="fqco" global="true" defaultValue="1_fastqc">The relative path to the directory containing the FastQC results.</param>
    <param name="ctdOut" shortName="ctdo" global="true" defaultValue="2_cutadapt">The relative path to the directory containing the Cutadapt results.</param>
    <param name="bwtOut" shortName="bwto" global="true" defaultValue="3_bowtie">The relative path to the directory containing the Bowtie results.</param>
    <param name="bamstOut" shortName="bsto" global="true" defaultValue="4_bam_stats">The relative path to the directory containing the Samtools stats and Plot-bamstats results.</param>
    <param name="ftqOut" shortName="ftqo" global="true" defaultValue="5_feature_counts">The relative path to the directory containing the FeatureCounts results.</param>
    <param name="dsqOut" shortName="dsqo" global="true" defaultValue="6_deseq2">The relative path to the directory containing the DESeq2 results.</param>
    <param name="edgOut" shortName="edgo" global="true" defaultValue="6_edger">The relative path to the directory containing the EdgeR results.</param>
    <param name="deaIntOut" shortName="dinto" global="true" defaultValue="6_deseq2+edger">The relative path to the directory containing the results of the DESeq2 and EdgeR integration.</param>
    <param name="mqcOut" shortName="mqco" global="true" defaultValue="7_multiqc">The relative path to the directory containing the MultiQC report.</param>

    <!-- Helper scripts. The bowtie-alignment task resolves testAdapterBashScript against scriptsDir;
         presumably the other script parameters are resolved the same way inside the task scripts (confirm). -->
    <param name="scriptsDir" shortName="sd" global="true" defaultValue="/scripts/">The relative path to the directory containing the R scripts.</param>
    <param name="testAdapterBashScript" shortName="tabs" global="true" defaultValue="test_isadapter.sh">The relative path to the directory containing the Bash script to get the path of the aligned/unaligned data.</param>
    <param name="deSeq2Rscript" shortName="rdes" global="true" defaultValue="run_deseq2.R">The relative path to the directory containing the R script to run the DESeq2 analysis.</param>
    <param name="filterCtsRscript" shortName="fcrs" global="true" defaultValue="filter_counts.R">The relative path to the directory containing the R script used to filter all-counts.txt and conditions_file.</param>
    <param name="edgerRscript" shortName="reds" global="true" defaultValue="run_edger_exact-test.R">The relative path to the directory containing the R script to run the EdgeR analysis.</param>
    <param name="enhancedVolcanoRscript" shortName="evrs" global="true" defaultValue="run_enhanced-volcano.R">The relative path to the directory containing the R script to build the Volcano plot.</param>
    <param name="deaIntRscript" shortName="deais" global="true" defaultValue="run_dea-integration.R">The relative path to the directory containing the R script to run the DESeq-EdgeR results integration.</param>
    <param name="vennRscript" shortName="rven" global="true" defaultValue="run_venn-diagram.R">The relative path to the directory containing the R script to build the Venn diagram.</param>
    <param name="deseq2NormalizationRscript" shortName="dnrs" global="true" defaultValue="run_deseq2_normalization.R">The relative path to the directory containing the R script to perform the DESeq2 count normalization before the hierarchical clustering.</param>
    <param name="hclustMakeTableRscript" shortName="hmk" global="true" defaultValue="run_make_hclust_table.R">The relative path to the directory containing the R script for the creation of the hclust table.</param>
    <param name="hclustRscript" shortName="hcr" global="true" defaultValue="run_hierarchical-clustering.R">The relative path to the directory containing the R script for the hierarchical clustering analysis.</param>
    <param name="functionalEnrichmentRscript" shortName="fers" global="true" defaultValue="run_functional-enrichment-analysis.R">The relative path to the directory containing the R script for the functional enrichment analysis.</param>
    <param name="networkRscript" shortName="nrs" global="true" defaultValue="run_network_plot.R">The relative path to the directory containing the R script for the network creation.</param>

    <!-- Databases used by the functional enrichment and network steps. -->
    <param name="databasesDir" shortName="db" global="true" defaultValue="/databases/">The relative path to the directory containing the TarBase and Reactome databases.</param>
    <param name="tarbaseDB" shortName="tbdb" global="true" defaultValue="TarBase_v8_download.txt">The relative path to the TarBase file.</param>
    <param name="reactomeDB" shortName="rcdb" global="true" defaultValue="Ensembl2Reactome.txt">The relative path to a Reactome file with Ensembl IDs and Reactome IDs.</param>
    <param name="reactomeInteractionsDB" shortName="rcidb" global="true" defaultValue="reactome.all_species.interactions.tab-delimited.txt">The relative path to the Reactome file downloaded from https://reactome.org/download/current/interactors/reactome.all_species.interactions.tab-delimited.txt and renamed as ReactomeInteractions.txt.</param>

    <!-- Docker image versions. -->
    <param name="rDeseq2Version" shortName="rdv" defaultValue="1.32.0_v2" global="true">Version of the pegi3s/r_deseq2 Docker image to use.</param>
    <param name="rEdgerVersion" shortName="redv" defaultValue="3.36.0" global="true">Version of the pegi3s/r_edger Docker image to use.</param>
    <param name="rEnhancedVolcanoVersion" shortName="revv" defaultValue="1.12.0" global="true">Version of the pegi3s/r_enhanced-volcano Docker image to use.</param>
    <param name="cutadaptVersion" shortName="cdv" defaultValue="1.16" global="true">Version of the pegi3s/cutadapt Docker image to use.</param>
    <param name="fastqcVersion" shortName="fqcv" defaultValue="0.11.9" global="true">Version of the pegi3s/fastqc Docker image to use.</param>
    <param name="bowtieVersion" shortName="bwv" defaultValue="1.2.3" global="true">Version of the pegi3s/bowtie1 Docker image to use.</param>
    <param name="featureCountsVersion" shortName="fqv" defaultValue="2.0.0" global="true">Version of the pegi3s/feature-counts Docker image to use.</param>
    <param name="samtoolsVersion" shortName="stv" defaultValue="1.9" global="true">Version of the pegi3s/samtools_bcftools Docker image to use.</param>
    <param name="samtoolsBamstatsVersion" shortName="spb" defaultValue="1.10" global="true">Version of the pegi3s/samtools_bcftools Docker image to use for bam analysis.</param>
    <param name="rdatanalysisVersion" shortName="rdav" defaultValue="4.1.1_v3" global="true">Version of the pegi3s/r_data-analysis Docker image to use.</param>
    <param name="rVennVersion" shortName="rvv" defaultValue="1.7.0" global="true">Version of the pegi3s/r_venn-diagram Docker image to use.</param>
    <param name="rNetworkVersion" shortName="rntv" defaultValue="4.1.1_v2_v3" global="true">Version of the pegi3s/r_network Docker image to use.</param>
    <param name="multiqcVersion" shortName="mqv" defaultValue="1.14.0" global="true">Version of the pegi3s/multiqc Docker image to use.</param>

    <!-- Flags and execution selectors. -->
    <flag name="skipPullDockerImages" shortName="sdi">Use this flag to skip the pull-docker-images task.</flag>
    <flag name="deseqNormalizationHclust" shortName="dnh">Use this flag to perform a global count normalization with DESeq2 before the hierarchical clustering.</flag>
    <param name="selectDEAsoftware" shortName="dea" defaultValue="both" global="true">Use this param to select the differential expression analysis software (deseq, edger or both).</param>
</params>
<tasks>
<!--Pull the docker images of the software used for the analysis.
    The task declares the skipPullDockerImages flag and is guarded by the "if" condition below.
    NOTE(review): the condition hands the expanded value of the flag to "[ -v ]" instead of the
    variable name. Presumably this evaluates to true only while the flag is unset (so passing the
    flag skips the task, as the flag description says); confirm against Compi's flag semantics.
    "[ ! -v skipPullDockerImages ]" would state that intent directly.-->
<task id="pull-docker-images" params="skipPullDockerImages" if="[ -v ${skipPullDockerImages} ]">
${scriptsDir}/pull-docker-images.sh
</task>
<!-- Runs initialization.sh after pull-docker-images; per the pipeline metadata this step creates the output directories. -->
<task id="initialization"
      after="pull-docker-images">
    ${scriptsDir}/initialization.sh
</task>
<!-- Bowtie genome index build.
     Executed only when the user sets a non-empty value for the genome parameter.
     The value is quoted inside the test so that a path containing spaces does not break the condition. -->
<task id="build-genome-index"
      after="initialization"
      params="bwtOut genome"
      if="[ -n &quot;${genome}&quot; ]">
    ${scriptsDir}/build-genome-index.sh
</task>
<!-- Quality control with FastQC.
     Iterates over the entries of fastqDir whose name contains ".fastq".
     The grep pattern is single-quoted so the shell neither strips the backslash
     (which would turn the literal dot into "any character") nor glob-expands the pattern. -->
<foreach id="fastqc-qc"
         after="initialization"
         of="command"
         in="ls -1 ${fastqDir} | grep '.*\.fastq'"
         as="file">
    ${scriptsDir}/fastqc-qc.sh
</foreach>
<!-- Adapter removal with cutadapt.
     Iterates over the entries of fastqDir whose name contains ".fastq".
     The grep pattern is single-quoted so the shell neither strips the backslash
     (which would turn the literal dot into "any character") nor glob-expands the pattern. -->
<foreach id="cut-sequences"
         after="fastqc-qc"
         of="command"
         in="ls -1 ${fastqDir} | grep '.*\.fastq'"
         as="file"
         params="adapter">
    ${scriptsDir}/cut-sequences.sh
</foreach>
<!-- Alignment with Bowtie.
     The items to iterate over are the lines printed by the testAdapterBashScript helper, which is
     called with the adapter, the Cutadapt output directory and the fastq directory.
     Waits for both build-genome-index and cut-sequences. -->
<foreach id="bowtie-alignment"
         after="build-genome-index cut-sequences"
         of="command"
         in="bash ${scriptsDir}/${testAdapterBashScript} ${adapter} ${workingDir}/${outDir}/${ctdOut}/ ${fastqDir}"
         as="file"
         params="bwtIndex adapter fastqDir">
    ${scriptsDir}/bowtie-alignment.sh
</foreach>
<!-- Converts the sam files to bam format.
     Iterates over the entries of the Bowtie output directory whose name contains ".fastq" followed by ".sam".
     The pattern is single-quoted and the escape sits before the dot (the original had ".\fastq"),
     so both dots are matched literally. -->
<foreach id="sam-to-bam"
         after="bowtie-alignment"
         of="command"
         in="ls -1 ${workingDir}/${outDir}/${bwtOut}/ | grep '.*\.fastq.*\.sam'"
         as="file">
    ${scriptsDir}/sam-to-bam.sh
</foreach>
<!-- Performs the quality control of the bam files with SAMtools.
     Iterates over the entries of the Bowtie output directory whose name contains ".fastq" followed by ".bam".
     The pattern is single-quoted and the escape sits before the dot (the original had ".\fastq"),
     so both dots are matched literally. -->
<foreach id="bam-stats"
         after="sam-to-bam"
         of="command"
         in="ls -1 ${workingDir}/${outDir}/${bwtOut}/ | grep '.*\.fastq.*\.bam'"
         as="bamFile">
    ${scriptsDir}/bam-stats.sh
</foreach>
<!-- Quantification with featureCounts; runs once after bam-stats and receives the gff file, feature and attribute parameters. -->
<task id="feature-counts"
      after="bam-stats"
      params="gffFile gffFeature gffAttribute">
    ${scriptsDir}/feature-counts.sh
</task>
<!-- Differential expression analysis with DESeq2.
     One iteration per line of the contrast file, skipping its first line (tail -n+2).
     Runs only when selectDEAsoftware is "deseq" or "both". -->
<foreach id="deseq"
         after="feature-counts"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast deSeq2Rscript"
         if="[ ${selectDEAsoftware} = deseq ] || [ ${selectDEAsoftware} = both ]">
    ${scriptsDir}/deseq.sh
</foreach>
<!-- Differential expression analysis with EdgeR.
     One iteration per line of the contrast file, skipping its first line (tail -n+2).
     Runs only when selectDEAsoftware is "edger" or "both". -->
<foreach id="edger"
         after="feature-counts"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast edgerRscript"
         if="[ ${selectDEAsoftware} = edger ] || [ ${selectDEAsoftware} = both ]">
    ${scriptsDir}/edger.sh
</foreach>
<!-- Compares the DESeq2 and EdgeR results and builds a result file with the coincidences.
     One iteration per line of the contrast file, skipping its first line.
     Runs only when selectDEAsoftware is "both", after deseq and edger. -->
<foreach id="dea-integration"
         after="deseq edger"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast"
         if="[ ${selectDEAsoftware} = both ]">
    ${scriptsDir}/dea-integration.sh
</foreach>
<!-- Builds the Venn diagram with the DESeq2+EdgeR results.
     One iteration per line of the contrast file, skipping its first line.
     The condition takes the second double-quote-delimited field of the comparison as a directory name
     under deaIntOut and runs the task only when the *_venn.tsv file in its "pipel" subdirectory exists
     and is not empty (test -s).
     Double quotes inside the attribute are written as &quot; so that the attribute value stays well-formed. -->
<foreach id="venn"
         after="deseq edger dea-integration"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="vennFormat contrast"
         if="[ -s &quot;${workingDir}/${outDir}/${deaIntOut}/$(echo ${comparison} | cut -d '&quot;' -f 2)&quot;/pipel/*_venn.tsv ]">
    ${scriptsDir}/venn.sh
</foreach>
<!-- Builds the Volcano plot with the DESeq2/EdgeR results.
     One iteration per line of the contrast file, skipping its first line; waits for deseq, edger and dea-integration. -->
<foreach id="volcano"
         after="deseq edger dea-integration"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast">
    ${scriptsDir}/volcano.sh
</foreach>
<!-- Builds a hierarchical clustering from the DESeq2/EdgeR results.
     One iteration per line of the contrast file, skipping its first line.
     The script is called with the literal word "deseqNormalizationHclust" as its only argument;
     presumably hclust.sh uses it as the name of the flag to inspect (confirm in the script). -->
<foreach id="hclust"
         after="deseq edger dea-integration"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast deseqNormalizationHclust">
    ${scriptsDir}/hclust.sh deseqNormalizationHclust
</foreach>
<!-- Functional enrichment analysis on the DESeq2/EdgeR results.
     One iteration per line of the contrast file, skipping its first line; starts after hclust. -->
<foreach id="functional-enrichment"
         after="hclust"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast">
    ${scriptsDir}/functional-enrichment.sh
</foreach>
<!-- Builds the network of interactions for the most enriched pathway reported by the functional-enrichment task.
     One iteration per line of the contrast file, skipping its first line. -->
<foreach id="network"
         after="functional-enrichment"
         of="command"
         in="tail -n+2 ${contrast}"
         as="comparison"
         params="conditions contrast">
    ${scriptsDir}/network.sh
</foreach>
<!-- Final step: creates a MultiQC report with the results, once the network task has finished. -->
<task id="multiqc"
      after="network">
    ${scriptsDir}/multiqc.sh
</task>
</tasks>
<metadata>
    <!-- Human-readable descriptions, one per task id declared in the tasks section. -->
    <task-description id="pull-docker-images">Pull the docker images of the software used for the analysis.</task-description>
    <task-description id="initialization">Creates the output directories.</task-description>
    <task-description id="fastqc-qc">Quality control with FastQC.</task-description>
    <task-description id="cut-sequences">Adapter removal with cutadapt.</task-description>
    <task-description id="build-genome-index">Builds a genome index if the genome parameter is passed in compi.parameters.</task-description>
    <task-description id="bowtie-alignment">Alignment with Bowtie.</task-description>
    <task-description id="sam-to-bam">Converts the sam files to bam format.</task-description>
    <task-description id="bam-stats">Performs the quality control of the bam files with SAMtools.</task-description>
    <task-description id="feature-counts">Performs the quantification with featureCounts.</task-description>
    <task-description id="deseq">Performs the differential expression analysis with DESeq2.</task-description>
    <task-description id="edger">Performs the differential expression analysis with EdgeR.</task-description>
    <task-description id="dea-integration">Compares the results of DESeq2 and EdgeR and builds a result file with the coincidences.</task-description>
    <task-description id="venn">Build the Venn diagram with the DESeq2+EdgeR results.</task-description>
    <task-description id="volcano">Build the Volcano Plot with the DESeq2/EdgeR results.</task-description>
    <task-description id="hclust">Build a Hierarchical Clustering using the DESeq2/EdgeR results.</task-description>
    <task-description id="functional-enrichment">Performs a Functional Enrichment Analysis using the DESeq2/EdgeR results.</task-description>
    <task-description id="network">Build the network of interactions with the most enriched pathway of the functional-enrichment task.</task-description>
    <task-description id="multiqc">Create a MultiQC report with the results.</task-description>
</metadata>
</pipeline>