Skip to content

Commit

Permalink
Merge pull request #125 from CCBR/fix-qc-stats
Browse files Browse the repository at this point in the history
fix: QC stats sample IDs; add read counts between steps
  • Loading branch information
kelly-sovacool authored Nov 2, 2023
2 parents 8302666 + 263f9e4 commit 637471f
Show file tree
Hide file tree
Showing 45 changed files with 396 additions and 370 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,16 @@ jobs:
pip install .[dev,test]
- name: Stub run
run: |
cd tests/
cd tests/cli
which champagne
champagne init
champagne run -profile ci_stub -stub
champagne run -stub -c ci_stub.config --max_cpus 2 --max_memory 6.GB
- name: Test run
if: ${{ env.test_run == 'true' }}
run: |
cd tests/
cd tests/cli
champagne init
champagne run -profile ci_test,docker
champagne run -profile docker -c ci_test.config
- name: "Upload Artifact"
uses: actions/upload-artifact@v3
if: always() # run even if previous steps fail
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ replay_pid*
/work*/
/data/
/results/
/output/
/params.yaml

# python packaging
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
## development version

- Fixed a bug in QC stats that mixed up the statistics for different samples (#125).
- Fixed a bug in the CLI that added the `-profile` to the nextflow command even if it wasn't needed (#125).
- Report read counts between blacklist & filtering steps in the QC table (#125).
- Run spooker on workflow completion (#126).

## CHAMPAGNE 0.2.0
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.0-dev
0.2.1
14 changes: 10 additions & 4 deletions assets/multiqc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,19 @@ custom_data:
NReads:
description: "The number of reads sequenced"
format: "{:,.0f}"
hidden: True
N_reads_surviving_blacklist:
description: "The number of reads surviving after filtering blacklisted regions"
format: "{:,.0f}"
hidden: true
NMappedReads:
description: "The number of reads mapped"
format: "{:,.0f}"
hidden: True
N_mapped_reads_surviving_filter:
description: "The number of mapped reads surviving after filtering by alignment quality"
format: "{:,.0f}"
hidden: true
NUniqMappedReads:
description: "The number of reads remaining after deduplication"
description: "The number of mapped & filtered reads remaining after deduplication"
format: "{:,.0f}"
NRF:
description: "Non-Redundant fraction"
Expand Down Expand Up @@ -163,7 +169,7 @@ custom_data:

sp:
QC_Table:
fn: "qc_table.txt"
fn: "qc_table.tsv"
NGSQC_data:
fn: "*NGSQC.txt"
frip_samples:
Expand Down
15 changes: 15 additions & 0 deletions bin/compare-tables.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Dev script: compare QC read-count columns between the legacy QCTable.txt
# and the new qc_table.tsv, reporting the percent difference per
# sample/metric pair. Intended for interactive use (ends in View()).
library(tidyverse)

# Legacy table is whitespace-delimited with a header row; keep only the
# sample ID plus the read-count columns, as long (sample, metric, value) rows.
original <- read.table("QCTable.txt", header = TRUE) %>%
  as_tibble() %>%
  mutate(across(contains("reads"), as.integer)) %>%
  select(c("SampleName", contains("reads"))) %>%
  pivot_longer(-SampleName, names_to = "name", values_to = "value_orig")

# New table: keep only the metrics present in the legacy table so the two
# long formats line up one-to-one. all_of() + unique() avoids the deprecated
# (and here duplicated) bare external vector in select().
new <- read_tsv("qc_table.tsv") %>%
  select(SampleName, all_of(unique(pull(original, name)))) %>%
  pivot_longer(-SampleName, values_to = "value_new")

# Explicit join keys avoid relying on inner_join()'s natural-join guess.
inner_join(original, new, by = c("SampleName", "name")) %>%
  mutate(rel_diff_percent = round(100 * (value_new - value_orig) / value_orig, 2)) %>%
  View() # interactive sessions only; may fail under Rscript without a viewer
25 changes: 25 additions & 0 deletions bin/count-peaks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Dev script: compare per-sample/per-tool peak counts between the new
# peak_meta.tsv and the legacy old_peak_counts.tsv, reporting the percent
# change. Intended for interactive use (ends in View()).
library(tidyverse)

# New counts: one row per (sample, peak-calling tool).
# count(..., name =) replaces the group_by %>% count %>% rename chain.
peak_counts <- read_tsv("peak_meta.tsv") %>%
  count(sample_id, tool, name = "count_new")

# Show which tools are represented (exploratory console output).
peak_counts %>%
  pull(tool) %>%
  unique() %>%
  print()

# Legacy counts: the `file` column looks like "<tool>/<sample_id>/<file>";
# derive tool and sample from it and harmonize old tool names with new ones.
peaks_old <- read_tsv("old_peak_counts.tsv") %>%
  mutate(
    tool = str_remove(file, "/.*"),
    tool = case_when(
      tool == "macsBroad" ~ "macs_broad",
      tool == "macsNarrow" ~ "macs_narrow",
      TRUE ~ tool
    ),
    sample_id = str_replace(file, ".*/(.*)/.*", "\\1")
  ) %>%
  rename(count_old = count) %>%
  select(sample_id, tool, count_old)

# Explicit join keys avoid inner_join()'s natural-join message.
inner_join(peaks_old, peak_counts, by = c("sample_id", "tool")) %>%
  mutate(rel_diff_percent = round(100 * (count_new - count_old) / count_old, 2)) %>%
  View() # interactive sessions only
11 changes: 5 additions & 6 deletions bin/createtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ def file2table():
df = pd.DataFrame(tabledict)
df.index.name = "SampleName"
df.reset_index(inplace=True)
# print(df[['NSC', 'FRiP', 'PCB1', 'PCB2', 'RSC']]) #re-order columns
# cols = df.columns.tolist() # view df columns names
# orderedcols = ordercolumns(cols)
# print(df.to_string())
df = df.sort_values(by="SampleName")

# sometimes preseq fails, resulting in some columns not being present.
# so this only keeps columns that exist in the dict.
Expand All @@ -46,7 +43,9 @@ def file2table():
for col in [
"SampleName",
"NReads",
"N_reads_surviving_blacklist",
"NMappedReads",
"N_mapped_reads_surviving_filter",
"NUniqMappedReads",
"NRF",
"PBC1",
Expand All @@ -58,8 +57,8 @@ def file2table():
]
if col in df_columns
]

print(df[column_order].to_string(index=False, justify="left"))
df = df[column_order]
df.to_csv("qc_table.tsv", sep="\t", index=False)


if __name__ == "__main__":
Expand Down
2 changes: 2 additions & 0 deletions bin/filterMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def getmetadata(type):
metadata = "NReads"
elif type == "mnreads":
metadata = "NMappedReads"
elif type == "N_mapped_reads_surviving_filter":
metadata = type
elif type == "unreads":
metadata = "NUniqMappedReads"
elif type == "fragLen":
Expand Down
22 changes: 5 additions & 17 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

process {

// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
Expand All @@ -32,19 +31,19 @@ process {
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
cpus = { check_max( 4 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
cpus = { check_max( 16 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
cpus = { check_max( 32 * task.attempt, 'cpus' ) }
memory = { check_max( 120.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
Expand All @@ -59,15 +58,4 @@ process {
errorStrategy = 'retry'
maxRetries = 2
}
/*
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}*/

// Custom CCBR resource requirements
withLabel:process_higher {
cpus = { check_max( 32 * task.attempt, 'cpus' ) }
memory = { check_max( 120.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}
}
21 changes: 11 additions & 10 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@ process {

errorStrategy = 'finish'

withName: 'INPUT_CHECK:SAMPLESHEET_CHECK' {
/*
withName: '.*CUSTOM_DUMPSOFTWAREVERSIONS' {
cache = false
publishDir = [
path: { "${params.outdir}/pipeline_info" },
mode: params.publish_dir_mode,
pattern: '*_versions.yml'
]
}*/

withName: '.*INPUT_CHECK:SAMPLESHEET_CHECK' {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
withName: 'CUTADAPT' {
ext.args = [
'--nextseq-trim=2',
'--trim-n -n 5 -O 5',
'-q 10,10',
'-m 20',
'-b file:/opt2/TruSeq_and_nextera_adapters.consolidated.fa'
].join(' ').trim()
}
}
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ params {
deeptools.excluded_chroms = 'chrM'
run {
qc = true
deeptools = true
normalize_input = true
call_peaks = true
gem = true
Expand Down
8 changes: 6 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ workflow MAKE_REFERENCE {

// MAIN WORKFLOW
workflow {
CHIPSEQ()
}

workflow CHIPSEQ {
INPUT_CHECK(file(params.input), params.seq_center)
INPUT_CHECK.out.reads.set { raw_fastqs }
raw_fastqs | CUTADAPT
Expand All @@ -72,8 +76,8 @@ workflow {

ch_multiqc = Channel.of()
if (params.run.qc) {
QC(raw_fastqs, trimmed_fastqs,
aligned_bam, ALIGN_GENOME.out.flagstat,
QC(raw_fastqs, trimmed_fastqs, FILTER_BLACKLIST.out.n_surviving_reads,
aligned_bam, ALIGN_GENOME.out.aligned_flagstat, ALIGN_GENOME.out.filtered_flagstat,
deduped_bam, DEDUPLICATE.out.flagstat,
PHANTOM_PEAKS.out.spp, frag_lengths,
PREPARE_GENOME.out.gene_info,
Expand Down
32 changes: 16 additions & 16 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
},
"bwa/mem": {
"branch": "main",
"git_sha": "ca4f84b4c2ca84eb0449b4ba414a8b8052f8d90a",
"installed_by": ["filter_blacklist", "modules"]
"git_sha": "7887b0e0dc5a0320d8ba84c2763ef8692c358087",
"installed_by": ["modules", "filter_blacklist"]
},
"custom/countfastq": {
"branch": "main",
"git_sha": "2ccd43e3734de30fe61ed0ff80e6e3252929505e",
"installed_by": ["filter_blacklist"]
},
"cutadapt": {
"branch": "main",
Expand All @@ -27,17 +32,22 @@
},
"picard/samtofastq": {
"branch": "main",
"git_sha": "258d0f336ea1f851ab4223d295bb18b6dc187899",
"git_sha": "25e6e67a4ec172db1bbb0ef995c4a470d847143a",
"installed_by": ["filter_blacklist"]
},
"samtools/filteraligned": {
"branch": "main",
"git_sha": "879e969c593ab9f321301ac15722728ab30cea49",
"installed_by": ["filter_blacklist"]
},
"samtools/flagstat": {
"branch": "main",
"git_sha": "25e6e67a4ec172db1bbb0ef995c4a470d847143a",
"installed_by": ["modules"]
},
"samtools/sort": {
"branch": "main",
"git_sha": "d55ab2580b69a81aa0534a3018cc6e6ea3b28640",
"git_sha": "5b39869abfc740c6243d18a3cd84aa7d78787125",
"installed_by": ["modules"]
}
}
Expand All @@ -46,7 +56,7 @@
"CCBR": {
"filter_blacklist": {
"branch": "main",
"git_sha": "bb7dbb42afe47d7e02b2f21e3352720ca2996e11",
"git_sha": "b7764378fac18bea8c84f9dd39cb595241b6e796",
"installed_by": ["subworkflows"]
}
}
Expand All @@ -57,17 +67,7 @@
"nf-core": {
"bedtools/getfasta": {
"branch": "master",
"git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f",
"installed_by": ["modules"]
},
"bwa/index": {
"branch": "master",
"git_sha": "28a23ea6529caff44855c774f439a4074883027c",
"installed_by": ["modules"]
},
"samtools/flagstat": {
"branch": "master",
"git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f",
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
"installed_by": ["modules"]
}
}
Expand Down
2 changes: 1 addition & 1 deletion modules/CCBR/bwa/mem/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions modules/CCBR/custom/countfastq/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 637471f

Please sign in to comment.