diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 90c7463..a7b7c80 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ docker-run: - MATRIX_NAME: [ "fusions", "differential_expression", "isoforms", "only_differential_expression", "differential_expression_gff3", - "ncbi_gzip" + "ncbi_gzip", "denovo" ] rules: # NOTE As we're overriding the rules block for the included docker-run @@ -59,6 +59,10 @@ docker-run: NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \ --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf" NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref + - if: $MATRIX_NAME == "denovo" + variables: + NF_WORKFLOW_OPTS: "--fastq test_data/fastq/SIRV_E0_PCS109_50.fq.gz --transcriptome_source denovo" + NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref,build_minimap_index - if: $MATRIX_NAME == "fusions" variables: NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index 399e950..afbd3ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [unreleased] +## [v0.1.13] +### Added +- Handling for GFF3 reference_annotation file type. +- Warning for the `--transcriptome_source` denovo pipeline option. ### Changed - Enum choices are enumerated in the `--help` output - Enum choices are enumerated as part of the error message when a user has selected an invalid choice @@ -12,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads` +- Fix for the `--transcriptome_source` denovo pipeline option. ## [v0.1.12] ### Added diff --git a/bin/workflow_glue/compute_cluster_quality.py b/bin/workflow_glue/compute_cluster_quality.py index adf3deb..9e768de 100755 --- a/bin/workflow_glue/compute_cluster_quality.py +++ b/bin/workflow_glue/compute_cluster_quality.py @@ -7,7 +7,6 @@ from collections import defaultdict import math from pathlib import Path -import sys import matplotlib from matplotlib import pyplot as plt @@ -18,10 +17,12 @@ adjusted_rand_score, completeness_score, homogeneity_score, v_measure_score) -from .util import wf_parser # noqa: ABS101 +from .util import wf_parser, get_named_logger # noqa: ABS101 matplotlib.use('Agg') +logger = get_named_logger("clustqual") + def argparser(): """Argument parser for entrypoint.""" @@ -181,16 +182,15 @@ def compute_v_measure(clusters, classes): compl_score = completeness_score(class_list, cluster_list) homog_score = homogeneity_score(class_list, cluster_list) ari = adjusted_rand_score(class_list, cluster_list) - - sys.stdout("Not included in clustering but aligned:", len(not_clustered)) - sys.stdout( + logger.info("Not included in clustering but aligned:", len(not_clustered)) + logger.info( "v:", v_score, "Completeness:", compl_score, "Homogeneity:", homog_score) - sys.stdout( + logger.info( "Nr reads clustered but unaligned " "(i.e., no class and excluded from v-measure): ", clustered_but_unaligned) @@ -231,14 +231,14 @@ def compute_v_measure_non_singleton_classes(clusters, classes): homog_score = homogeneity_score(class_list, cluster_list) nr_filtered_classes = len( [1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5]) - sys.stdout( + logger.info( "NONTRIvIAL CLASSES: v:", v_score, "Completeness:", compl_score, "Homogeneity:", homog_score) - sys.stdout("NUMBER OF CLASSES (FILTERED):", len( + logger.info("NUMBER OF CLASSES (FILTERED):", len( [1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5])) return v_score, compl_score, homog_score, nr_filtered_classes @@ -268,20 +268,20 @@ def compute_v_measure_non_singletons(clusters, classes): class_list.append(classes[read]) cluster_list.append(clusters[read]) else: - # sys.stdout("Read was clustered but unaligned:", read) + logger.debug("Read was clustered but unaligned:", read) clustered_but_unaligned += 1 v_score = v_measure_score(class_list, cluster_list) compl_score = completeness_score(class_list, cluster_list) homog_score = homogeneity_score(class_list, cluster_list) - sys.stdout( + logger.info( "NONTRIvIAL CLUSTERS: v:", v_score, "Completeness:", compl_score, "Homogeneity:", homog_score) - sys.stdout( + logger.info( "NONTRIvIAL CLUSTERS: Nr reads clustered but unaligned " "(i.e., no class and excluded from v-veasure): ", clustered_but_unaligned) @@ -407,11 +407,11 @@ def get_cluster_information(clusters, classes): else: clustered_classes[class_id] += 1 - sys.stdout("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes)) - sys.stdout("CLUSTERED:", "Tot classes:", len(clustered_classes)) - sys.stdout("MIXED:", "Tot classes containing both:", len( + logger.info("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes)) + logger.info("CLUSTERED:", "Tot classes:", len(clustered_classes)) + logger.info("MIXED:", "Tot classes containing both:", len( set(clustered_classes.keys()) & set(not_clustered_classes.keys()))) - sys.stdout("Total number of classes (unique gene ID):", total_nr_classes) + logger.info("Total number of classes (unique gene ID):", total_nr_classes) return ( total_nr_classes - len(singleton_classes), len(singleton_classes), @@ -507,7 +507,7 @@ def main(args): non_singleton_clusters = total_nr_clusters - singleton_clusters - sys.stdout( + logger.info( "NONTRIVIAL CLUSTERS: ", (total_nr_clusters - singleton_clusters)) outfile.write("CLUSTERS\n") diff --git a/main.nf b/main.nf index e47fb20..706b87e 100644 --- a/main.nf +++ b/main.nf @@ -255,7 +255,7 @@ process run_gffcompare{ script: def out_dir = "${sample_id}_gffcompare" - if ( ref_annotation.name.startsWith('OPTIONAL_FILE') ){ + if (params.transcriptome_source == "denovo"){ """ mkdir $out_dir """ @@ -454,14 +454,14 @@ workflow pipeline { ref_transcriptome use_ref_ann main: - if (params.ref_genome.toLowerCase().endsWith("gz")) { + if (params.ref_genome && file(params.ref_genome).extension == "gz") { // gzipped ref not supported by some downstream tools // easier to just decompress and pass it around. ref_genome = decompress_ref(ref_genome) }else { ref_genome = Channel.fromPath(ref_genome) } - if (params.ref_annotation.toLowerCase().endsWith("gz")) { + if (params.ref_annotation && file(params.ref_annotation).extension == "gz") { // gzipped ref not supported by some downstream tools // easier to just decompress and pass it around. decompress_annot= decompress_annotation(ref_annotation) @@ -519,6 +519,7 @@ workflow pipeline { if (params.transcriptome_source == "denovo"){ log.info("Doing de novo assembly") + log.info("WARNING: The `--transcriptome_source` denovo option may have unexpected results and errors. If possible it is preferable to use the reference-guided pipeline.") assembly = denovo_assembly(full_len_reads, ref_genome) } else { diff --git a/nextflow.config b/nextflow.config index f4b1ebc..6dc6b57 100644 --- a/nextflow.config +++ b/nextflow.config @@ -116,7 +116,7 @@ manifest { description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.' mainScript = 'main.nf' nextflowVersion = '>=22.10.8' - version = 'v0.1.12' + version = 'v0.1.13' } executor { diff --git a/subworkflows/denovo_assembly.nf b/subworkflows/denovo_assembly.nf index 422b7e9..e7fff94 100644 --- a/subworkflows/denovo_assembly.nf +++ b/subworkflows/denovo_assembly.nf @@ -178,12 +178,13 @@ process make_batches { then batch_size=$minimum_batch_size fi + echo "Num bases: \$nr_bases"; else batch_size=${params.isOnClust2_batch_size} fi echo "Batch size:\$batch_size"; - echo "Num bases: \$nr_bases"; + mkdir -p sorted; isONclust2 sort $params.isOnClust2_sort_options -v -o sorted $fastq; """ @@ -199,7 +200,7 @@ process clustering() { tuple val(sample_id), path('isONcluster_ROOT.cer'), emit: root_cluster script: """ - workflow-glue run_isonclust2 $sorted_batches + workflow-glue run_isonclust2 --workdir . """ } @@ -261,8 +262,7 @@ workflow denovo_assembly { cds_align( merge_cds.out.final_polished_cds .join(make_batches.out.sorted_reads_dir)) - - if (!reference.name.startsWith('OPTIONAL_FILE')){ + if (params.ref_genome) { cluster_quality(reference, fastq_reads_fl .join(dump_clusters.out.final_clusters_dir))