Merge branch 'CW-2188-denovo-test' into 'dev'

CW-2188: Add test for denovo, plus warnings. Closes CW-2188. See merge request epi2melabs/workflows/wf-transcriptomes!100
epi2me-labs · Jun 14, 2023 · cb73e0a · cb73e0a
2 parents 7d3b67a + 9f449ae
commit cb73e0a
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 26 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -45,7 +45,7 @@ docker-run:
             - MATRIX_NAME: [
                 "fusions", "differential_expression", "isoforms",
                 "only_differential_expression", "differential_expression_gff3",
-                "ncbi_gzip"
+                "ncbi_gzip", "denovo"
             ]
     rules:
         # NOTE As we're overriding the rules block for the included docker-run
@@ -59,6 +59,10 @@ docker-run:
               NF_WORKFLOW_OPTS: "--fastq  ERR6053095_chr20.fastq --transcriptome-source reference-guided \
                   --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf"
               NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
+        - if: $MATRIX_NAME == "denovo"
+          variables:
+              NF_WORKFLOW_OPTS: "--fastq test_data/fastq/SIRV_E0_PCS109_50.fq.gz --transcriptome_source denovo"
+              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref,build_minimap_index
         - if: $MATRIX_NAME == "fusions"
           variables:
               NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf  test_data.tar.gz

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,14 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [unreleased]
+## [v0.1.13]
+### Added
+- Handling for GFF3 reference_annotation file type.
+- Warning logged when the `--transcriptome_source denovo` pipeline option is selected.
 ### Changed
 - Enum choices are enumerated in the `--help` output
 - Enum choices are enumerated as part of the error message when a user has selected an invalid choice
 - Bumped minimum required Nextflow version to 22.10.8
 
 ### Fixed
 - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads`
+- Fix for the `--transcriptome_source` denovo pipeline option.
 
 ## [v0.1.12]
 ### Added

diff --git a/bin/workflow_glue/compute_cluster_quality.py b/bin/workflow_glue/compute_cluster_quality.py
@@ -7,7 +7,6 @@
 from collections import defaultdict
 import math
 from pathlib import Path
-import sys
 
 import matplotlib
 from matplotlib import pyplot as plt
@@ -18,10 +17,12 @@
     adjusted_rand_score, completeness_score,
     homogeneity_score, v_measure_score)
 
-from .util import wf_parser  # noqa: ABS101
+from .util import wf_parser, get_named_logger  # noqa: ABS101
 
 matplotlib.use('Agg')
 
+logger = get_named_logger("clustqual")
+
 
 def argparser():
     """Argument parser for entrypoint."""
@@ -181,16 +182,15 @@ def compute_v_measure(clusters, classes):
     compl_score = completeness_score(class_list, cluster_list)
     homog_score = homogeneity_score(class_list, cluster_list)
     ari = adjusted_rand_score(class_list, cluster_list)
-
-    sys.stdout("Not included in clustering but aligned:", len(not_clustered))
-    sys.stdout(
+    logger.info("Not included in clustering but aligned:", len(not_clustered))
+    logger.info(
         "v:",
         v_score,
         "Completeness:",
         compl_score,
         "Homogeneity:",
         homog_score)
-    sys.stdout(
+    logger.info(
         "Nr reads clustered but unaligned "
         "(i.e., no class and excluded from v-measure): ",
         clustered_but_unaligned)
@@ -231,14 +231,14 @@ def compute_v_measure_non_singleton_classes(clusters, classes):
     homog_score = homogeneity_score(class_list, cluster_list)
     nr_filtered_classes = len(
         [1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5])
-    sys.stdout(
+    logger.info(
         "NONTRIvIAL CLASSES: v:",
         v_score,
         "Completeness:",
         compl_score,
         "Homogeneity:",
         homog_score)
-    sys.stdout("NUMBER OF CLASSES (FILTERED):", len(
+    logger.info("NUMBER OF CLASSES (FILTERED):", len(
         [1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5]))
     return v_score, compl_score, homog_score, nr_filtered_classes
 
@@ -268,20 +268,20 @@ def compute_v_measure_non_singletons(clusters, classes):
             class_list.append(classes[read])
             cluster_list.append(clusters[read])
         else:
-            # sys.stdout("Read was clustered but unaligned:", read)
+            logger.debug("Read was clustered but unaligned:", read)
             clustered_but_unaligned += 1
 
     v_score = v_measure_score(class_list, cluster_list)
     compl_score = completeness_score(class_list, cluster_list)
     homog_score = homogeneity_score(class_list, cluster_list)
-    sys.stdout(
+    logger.info(
         "NONTRIvIAL CLUSTERS: v:",
         v_score,
         "Completeness:",
         compl_score,
         "Homogeneity:",
         homog_score)
-    sys.stdout(
+    logger.info(
         "NONTRIvIAL CLUSTERS: Nr reads clustered but unaligned "
         "(i.e., no class and excluded from v-veasure): ",
         clustered_but_unaligned)
@@ -407,11 +407,11 @@ def get_cluster_information(clusters, classes):
         else:
             clustered_classes[class_id] += 1
 
-    sys.stdout("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes))
-    sys.stdout("CLUSTERED:", "Tot classes:", len(clustered_classes))
-    sys.stdout("MIXED:", "Tot classes containing both:", len(
+    logger.info("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes))
+    logger.info("CLUSTERED:", "Tot classes:", len(clustered_classes))
+    logger.info("MIXED:", "Tot classes containing both:", len(
         set(clustered_classes.keys()) & set(not_clustered_classes.keys())))
-    sys.stdout("Total number of classes (unique gene ID):", total_nr_classes)
+    logger.info("Total number of classes (unique gene ID):", total_nr_classes)
     return (
         total_nr_classes - len(singleton_classes),
         len(singleton_classes),
@@ -507,7 +507,7 @@ def main(args):
 
     non_singleton_clusters = total_nr_clusters - singleton_clusters
 
-    sys.stdout(
+    logger.info(
         "NONTRIVIAL CLUSTERS: ", (total_nr_clusters - singleton_clusters))
 
     outfile.write("CLUSTERS\n")

diff --git a/main.nf b/main.nf
@@ -255,7 +255,7 @@ process run_gffcompare{
     script:
     def out_dir = "${sample_id}_gffcompare"
 
-    if ( ref_annotation.name.startsWith('OPTIONAL_FILE') ){
+    if (params.transcriptome_source == "denovo"){
         """
         mkdir $out_dir
         """
@@ -454,14 +454,14 @@ workflow pipeline {
         ref_transcriptome
         use_ref_ann
     main:
-        if (params.ref_genome.toLowerCase().endsWith("gz")) {
+        if (params.ref_genome && file(params.ref_genome).extension == "gz") {
             // gzipped ref not supported by some downstream tools
             // easier to just decompress and pass it around.
             ref_genome = decompress_ref(ref_genome)
         }else {
             ref_genome = Channel.fromPath(ref_genome)
         }
-        if (params.ref_annotation.toLowerCase().endsWith("gz")) {
+        if (params.ref_annotation && file(params.ref_annotation).extension == "gz") {
             // gzipped ref not supported by some downstream tools
             // easier to just decompress and pass it around.
             decompress_annot= decompress_annotation(ref_annotation)
@@ -519,6 +519,7 @@ workflow pipeline {
 
             if (params.transcriptome_source == "denovo"){
                 log.info("Doing de novo assembly")
+                log.info("WARNING: The `--transcriptome_source` denovo option may have unexpected results and errors. If possible it is preferable to use the reference-guided pipeline.")
                 assembly = denovo_assembly(full_len_reads, ref_genome)
 
             } else {

diff --git a/nextflow.config b/nextflow.config
@@ -116,7 +116,7 @@ manifest {
     description     = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=22.10.8'
-    version         = 'v0.1.12'
+    version         = 'v0.1.13'
 }
 
 executor {

diff --git a/subworkflows/denovo_assembly.nf b/subworkflows/denovo_assembly.nf
@@ -178,12 +178,13 @@ process make_batches {
             then
                 batch_size=$minimum_batch_size
         fi
+        echo "Num bases: \$nr_bases";
     else
         batch_size=${params.isOnClust2_batch_size}
     fi
 
     echo "Batch size:\$batch_size";
-    echo "Num bases: \$nr_bases";
+    
 
     mkdir -p sorted; isONclust2 sort $params.isOnClust2_sort_options -v -o sorted $fastq;
     """
@@ -199,7 +200,7 @@ process clustering() {
         tuple val(sample_id), path('isONcluster_ROOT.cer'), emit: root_cluster
     script:
     """
-    workflow-glue run_isonclust2 $sorted_batches
+    workflow-glue run_isonclust2 --workdir .
     """
 }
 
@@ -261,8 +262,7 @@ workflow denovo_assembly {
         cds_align(
             merge_cds.out.final_polished_cds
             .join(make_batches.out.sorted_reads_dir))
-
-        if (!reference.name.startsWith('OPTIONAL_FILE')){
+        if (params.ref_genome) {
             cluster_quality(reference, fastq_reads_fl
             .join(dump_clusters.out.final_clusters_dir))