Skip to content

Commit

Permalink
Merge branch 'CW-2188-denovo-test' into 'dev'
Browse files Browse the repository at this point in the history
CW-2188 add test for denovo plus warnings

Closes CW-2188

See merge request epi2melabs/workflows/wf-transcriptomes!100
  • Loading branch information
sarahjeeeze committed Jun 14, 2023
2 parents 7d3b67a + 9f449ae commit cb73e0a
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 26 deletions.
6 changes: 5 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ docker-run:
- MATRIX_NAME: [
"fusions", "differential_expression", "isoforms",
"only_differential_expression", "differential_expression_gff3",
"ncbi_gzip"
"ncbi_gzip", "denovo"
]
rules:
# NOTE As we're overriding the rules block for the included docker-run
Expand All @@ -59,6 +59,10 @@ docker-run:
NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \
--ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf"
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
- if: $MATRIX_NAME == "denovo"
variables:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq/SIRV_E0_PCS109_50.fq.gz --transcriptome_source denovo"
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref,build_minimap_index
- if: $MATRIX_NAME == "fusions"
variables:
NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz
Expand Down
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]
## [v0.1.13]
### Added
- Handling for GFF3 reference_annotation file type.
- Warning for the `--transcriptome_source` denovo pipeline option.
### Changed
- Enum choices are enumerated in the `--help` output.
- Enum choices are enumerated as part of the error message when a user has selected an invalid choice.
- Bumped minimum required Nextflow version to 22.10.8.

### Fixed
- Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `params.threads`.
- Fix for the `--transcriptome_source` denovo pipeline option.

## [v0.1.12]
### Added
Expand Down
32 changes: 16 additions & 16 deletions bin/workflow_glue/compute_cluster_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from collections import defaultdict
import math
from pathlib import Path
import sys

import matplotlib
from matplotlib import pyplot as plt
Expand All @@ -18,10 +17,12 @@
adjusted_rand_score, completeness_score,
homogeneity_score, v_measure_score)

from .util import wf_parser # noqa: ABS101
from .util import wf_parser, get_named_logger # noqa: ABS101

matplotlib.use('Agg')

logger = get_named_logger("clustqual")


def argparser():
"""Argument parser for entrypoint."""
Expand Down Expand Up @@ -181,16 +182,15 @@ def compute_v_measure(clusters, classes):
compl_score = completeness_score(class_list, cluster_list)
homog_score = homogeneity_score(class_list, cluster_list)
ari = adjusted_rand_score(class_list, cluster_list)

sys.stdout("Not included in clustering but aligned:", len(not_clustered))
sys.stdout(
logger.info("Not included in clustering but aligned:", len(not_clustered))
logger.info(
"v:",
v_score,
"Completeness:",
compl_score,
"Homogeneity:",
homog_score)
sys.stdout(
logger.info(
"Nr reads clustered but unaligned "
"(i.e., no class and excluded from v-measure): ",
clustered_but_unaligned)
Expand Down Expand Up @@ -231,14 +231,14 @@ def compute_v_measure_non_singleton_classes(clusters, classes):
homog_score = homogeneity_score(class_list, cluster_list)
nr_filtered_classes = len(
[1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5])
sys.stdout(
logger.info(
"NONTRIvIAL CLASSES: v:",
v_score,
"Completeness:",
compl_score,
"Homogeneity:",
homog_score)
sys.stdout("NUMBER OF CLASSES (FILTERED):", len(
logger.info("NUMBER OF CLASSES (FILTERED):", len(
[1 for cl_id in classes_dict if len(classes_dict[cl_id]) >= 5]))
return v_score, compl_score, homog_score, nr_filtered_classes

Expand Down Expand Up @@ -268,20 +268,20 @@ def compute_v_measure_non_singletons(clusters, classes):
class_list.append(classes[read])
cluster_list.append(clusters[read])
else:
# sys.stdout("Read was clustered but unaligned:", read)
logger.debug("Read was clustered but unaligned:", read)
clustered_but_unaligned += 1

v_score = v_measure_score(class_list, cluster_list)
compl_score = completeness_score(class_list, cluster_list)
homog_score = homogeneity_score(class_list, cluster_list)
sys.stdout(
logger.info(
"NONTRIvIAL CLUSTERS: v:",
v_score,
"Completeness:",
compl_score,
"Homogeneity:",
homog_score)
sys.stdout(
logger.info(
"NONTRIvIAL CLUSTERS: Nr reads clustered but unaligned "
"(i.e., no class and excluded from v-veasure): ",
clustered_but_unaligned)
Expand Down Expand Up @@ -407,11 +407,11 @@ def get_cluster_information(clusters, classes):
else:
clustered_classes[class_id] += 1

sys.stdout("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes))
sys.stdout("CLUSTERED:", "Tot classes:", len(clustered_classes))
sys.stdout("MIXED:", "Tot classes containing both:", len(
logger.info("UNCLUSTERED:", "Tot classes:", len(not_clustered_classes))
logger.info("CLUSTERED:", "Tot classes:", len(clustered_classes))
logger.info("MIXED:", "Tot classes containing both:", len(
set(clustered_classes.keys()) & set(not_clustered_classes.keys())))
sys.stdout("Total number of classes (unique gene ID):", total_nr_classes)
logger.info("Total number of classes (unique gene ID):", total_nr_classes)
return (
total_nr_classes - len(singleton_classes),
len(singleton_classes),
Expand Down Expand Up @@ -507,7 +507,7 @@ def main(args):

non_singleton_clusters = total_nr_clusters - singleton_clusters

sys.stdout(
logger.info(
"NONTRIVIAL CLUSTERS: ", (total_nr_clusters - singleton_clusters))

outfile.write("CLUSTERS\n")
Expand Down
7 changes: 4 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ process run_gffcompare{
script:
def out_dir = "${sample_id}_gffcompare"

if ( ref_annotation.name.startsWith('OPTIONAL_FILE') ){
if (params.transcriptome_source == "denovo"){
"""
mkdir $out_dir
"""
Expand Down Expand Up @@ -454,14 +454,14 @@ workflow pipeline {
ref_transcriptome
use_ref_ann
main:
if (params.ref_genome.toLowerCase().endsWith("gz")) {
if (params.ref_genome && file(params.ref_genome).extension == "gz") {
// gzipped ref not supported by some downstream tools
// easier to just decompress and pass it around.
ref_genome = decompress_ref(ref_genome)
}else {
ref_genome = Channel.fromPath(ref_genome)
}
if (params.ref_annotation.toLowerCase().endsWith("gz")) {
if (params.ref_annotation && file(params.ref_annotation).extension == "gz") {
// gzipped ref not supported by some downstream tools
// easier to just decompress and pass it around.
decompress_annot= decompress_annotation(ref_annotation)
Expand Down Expand Up @@ -519,6 +519,7 @@ workflow pipeline {

if (params.transcriptome_source == "denovo"){
log.info("Doing de novo assembly")
log.info("WARNING: The `--transcriptome_source` denovo option may have unexpected results and errors. If possible it is preferable to use the reference-guided pipeline.")
assembly = denovo_assembly(full_len_reads, ref_genome)

} else {
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ manifest {
description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
mainScript = 'main.nf'
nextflowVersion = '>=22.10.8'
version = 'v0.1.12'
version = 'v0.1.13'
}

executor {
Expand Down
8 changes: 4 additions & 4 deletions subworkflows/denovo_assembly.nf
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,13 @@ process make_batches {
then
batch_size=$minimum_batch_size
fi
echo "Num bases: \$nr_bases";
else
batch_size=${params.isOnClust2_batch_size}
fi
echo "Batch size:\$batch_size";
echo "Num bases: \$nr_bases";
mkdir -p sorted; isONclust2 sort $params.isOnClust2_sort_options -v -o sorted $fastq;
"""
Expand All @@ -199,7 +200,7 @@ process clustering() {
tuple val(sample_id), path('isONcluster_ROOT.cer'), emit: root_cluster
script:
"""
workflow-glue run_isonclust2 $sorted_batches
workflow-glue run_isonclust2 --workdir .
"""
}

Expand Down Expand Up @@ -261,8 +262,7 @@ workflow denovo_assembly {
cds_align(
merge_cds.out.final_polished_cds
.join(make_batches.out.sorted_reads_dir))

if (!reference.name.startsWith('OPTIONAL_FILE')){
if (params.ref_genome) {
cluster_quality(reference, fastq_reads_fl
.join(dump_clusters.out.final_clusters_dir))

Expand Down

0 comments on commit cb73e0a

Please sign in to comment.