From 2dbab6352aafe28b64b79665636f911e938d8c3e Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Fri, 13 Dec 2024 15:12:51 +0100 Subject: [PATCH 1/4] BamToFastq: Added 'fix' parameter. --- src/BamToFastq/main.cpp | 59 +++++++++++++++++--------------- src/cppNGS/BamWriter.cpp | 8 ++++- src/cppNGS/BamWriter.h | 3 ++ src/tools-TEST/BamToFastq_Test.h | 30 ++++++++++++++-- 4 files changed, 69 insertions(+), 31 deletions(-) diff --git a/src/BamToFastq/main.cpp b/src/BamToFastq/main.cpp index 8b6506881..bfd38d021 100644 --- a/src/BamToFastq/main.cpp +++ b/src/BamToFastq/main.cpp @@ -24,14 +24,15 @@ class ConcreteTool //optional addOutfile("out2", "Read 2 output FASTQ.GZ file (required for pair-end samples).", true); - addEnum("mode", "Determine if BAM/CRAM contains paired-end or single-end reads (default: paired-end)", true, QStringList() << "paired-end" << "single-end", "paired-end"); addString("reg", "Export only reads in the given region. Format: chr:start-end.", true); - addFlag("remove_duplicates", "Does not export duplicate reads into the FASTQ file."); + addFlag("remove_duplicates", "Does not export reads marked as duplicates in SAM flags into the FASTQ file."); addInt("compression_level", "Output FASTQ compression level from 1 (fastest) to 9 (best compression).", true, 1); addInt("write_buffer_size", "Output write buffer size (number of FASTQ entry pairs).", true, 100); addInfile("ref", "Reference genome for CRAM support (mandatory if CRAM is used).", true); addInt("extend", "Extend all reads to the given length. Base 'N' and base qualiy '2' are used for extension.", true, 0); + addFlag("fix", "Keep only one read pair if several have the same name (note: needs much memory as read names are kept in memory)."); + changeLog(2024, 12, 13, "Added 'fix' parameter."); changeLog(2024, 12, 9, "Added 'extend' parameter."); changeLog(2020, 11, 27, "Added CRAM support."); changeLog(2020, 5, 29, "Massive speed-up by writing in background. Added 'compression_level' parameter."); @@ -69,14 +70,9 @@ class ConcreteTool timer.start(); QTextStream out(stdout); BamReader reader(getInfile("in"), getInfile("ref")); - QString out1 = getOutfile("out1"); QString out2 = getOutfile("out2"); - QString mode = getEnum("mode"); - - if (mode == "paired-end" && out2.trimmed().isEmpty()) THROW(ArgumentException, "'out2' has to be provided for paired-end reads!"); - if (mode == "single-end" && out2.trimmed() != "") THROW(ArgumentException, "'out2' cannot be set for single-end reads!"); - + bool fix = getFlag("fix"); QString reg = getString("reg"); if (reg!="") { @@ -98,33 +94,30 @@ class ConcreteTool QThreadPool analysis_pool; analysis_pool.setMaxThreadCount(1); OutputWorker* output_worker; - if (mode == "paired-end") + const bool is_pe = !out2.trimmed().isEmpty(); + if (is_pe) { output_worker = new OutputWorker(pair_pool, out1, out2, compression_level); } - else if (mode == "single-end") + else //single-end { output_worker = new OutputWorker(pair_pool, out1, compression_level); } - else - { - THROW(ArgumentException, "Invalid mode '" + mode + "' provided!") - } analysis_pool.start(output_worker); long long c_unpaired = 0; long long c_paired = 0; long long c_duplicates = 0; long long c_single_end = 0; + long long c_fixed = 0; int max_cached = 0; //iterate through reads QHash> al_cache; + QSet reads_processed; QSharedPointer al = QSharedPointer(new BamAlignment()); - while (true) + while (reader.getNextAlignment(*al)) { - bool ok = reader.getNextAlignment(*al); - if (!ok) break; //out << al.name() << " PAIRED=" << al.isPaired() << " SEC=" << al.isSecondaryAlignment() << " PROP=" << al.isProperPair() << endl; //skip secondary alinments @@ -137,7 +130,19 @@ class ConcreteTool continue; } - if(mode == "paired-end") + //remove reads with same name (but handle read pairing properly) + if (fix) + { + QByteArray name = al->name() + (al->isRead1() ? "/1" : "/2"); + if (reads_processed.contains(name)) + { + ++c_fixed; + continue; + } + reads_processed << name; + } + + if(is_pe) { //skip unpaired if(!al->isPaired()) @@ -167,8 +172,7 @@ class ConcreteTool pair.status = ReadPair::TO_BE_WRITTEN; ++c_paired; } - //cache read for later retrieval - else + else //cache read for later retrieval { al_cache.insert(name, al); al = QSharedPointer(new BamAlignment()); @@ -176,21 +180,17 @@ class ConcreteTool max_cached = std::max(max_cached, al_cache.size()); } - else if (mode == "single-end") + else //single-end { ReadPair& pair = pair_pool.nextFreePair(); alignmentToFastq(al, pair.e1, extend); pair.status = ReadPair::TO_BE_WRITTEN; ++c_single_end; } - else - { - THROW(ArgumentException, "Invalid mode '" + mode + "' provided!") - } } //write debug output - if(mode == "paired-end") + if(is_pe) { out << "Pair reads (written) : " << c_paired << endl; out << "Unpaired reads (skipped) : " << c_unpaired << endl; @@ -200,10 +200,13 @@ class ConcreteTool { out << "Reads (written) : " << c_single_end << endl; } - if (remove_duplicates) { - out << "Duplicate reads (skipped) : " << c_duplicates << endl; + out << "Duplicate tagged reads (skipped): " << c_duplicates << endl; + } + if (fix) + { + out << "Duplicate name reads (skipped) : " << c_fixed << endl; } out << endl; out << "Maximum cached reads : " << max_cached << endl; diff --git a/src/cppNGS/BamWriter.cpp b/src/cppNGS/BamWriter.cpp index e91c53848..9f05dfac1 100644 --- a/src/cppNGS/BamWriter.cpp +++ b/src/cppNGS/BamWriter.cpp @@ -39,5 +39,11 @@ BamWriter::BamWriter(const QString& bam_file, const QString& ref_file) BamWriter::~BamWriter() { - sam_close(fp_); + close(); +} + +void BamWriter::close() +{ + if (!fp_closed_) sam_close(fp_); + fp_closed_ = true; } diff --git a/src/cppNGS/BamWriter.h b/src/cppNGS/BamWriter.h index 658372136..1db74d93c 100644 --- a/src/cppNGS/BamWriter.h +++ b/src/cppNGS/BamWriter.h @@ -15,6 +15,8 @@ class CPPNGSSHARED_EXPORT BamWriter BamWriter(const QString& bam_file, const QString& ref_file = QString()); //Destructor ~BamWriter(); + //close file (not necessary in most cases, as the file is closed in the constructor anyway) + void close(); //Write a BAM header from another BAM file void writeHeader(const BamReader& reader) @@ -39,6 +41,7 @@ class CPPNGSSHARED_EXPORT BamWriter QString bam_file_; samFile* fp_ = nullptr; sam_hdr_t* header_ = nullptr; + bool fp_closed_ = false; //"declared away" methods BamWriter(const BamWriter&) = delete; diff --git a/src/tools-TEST/BamToFastq_Test.h b/src/tools-TEST/BamToFastq_Test.h index 5e74c5b06..5827387b8 100644 --- a/src/tools-TEST/BamToFastq_Test.h +++ b/src/tools-TEST/BamToFastq_Test.h @@ -1,12 +1,14 @@ #include "TestFramework.h" #include "FastqFileStream.h" +#include "BamReader.h" +#include "BamWriter.h" TEST_CLASS(BamToFastq_Test) { Q_OBJECT private slots: - void test_mandatory_parameter() + void test_default_parameter() { EXECUTE("BamToFastq", "-in " + TESTDATA("data_in/BamToFastq_in1.bam") + " -out1 out/BamToFastq_out1.fastq.gz -out2 out/BamToFastq_out2.fastq.gz -write_buffer_size 1"); IS_TRUE(QFile::exists("out/BamToFastq_out1.fastq.gz")); @@ -15,6 +17,29 @@ private slots: COMPARE_GZ_FILES("out/BamToFastq_out2.fastq.gz", TESTDATA("data_out/BamToFastq_out2.fastq.gz")); } + void test_fix() //uses data and results from first test, but duplicates the reads + { + //create BAM with each read duplicated + QString bam_temp = Helper::tempFileName(".bam"); + BamWriter writer(bam_temp); + QString bam = TESTDATA("data_in/BamToFastq_in1.bam"); + for (int i=0; i<2; ++i) + { + BamReader reader(bam); + if (i==0) writer.writeHeader(reader); + BamAlignment al; + while(reader.getNextAlignment(al)) + { + writer.writeAlignment(al); + } + } + writer.close(); + + EXECUTE("BamToFastq", "-in " + bam_temp + " -out1 out/BamToFastq_out1_fix.fastq.gz -out2 out/BamToFastq_out2_fix.fastq.gz -write_buffer_size 1 -fix"); + COMPARE_GZ_FILES("out/BamToFastq_out1_fix.fastq.gz", TESTDATA("data_out/BamToFastq_out1.fastq.gz")); + COMPARE_GZ_FILES("out/BamToFastq_out2_fix.fastq.gz", TESTDATA("data_out/BamToFastq_out2.fastq.gz")); + } + void test_remove_duplicates() { EXECUTE("BamToFastq", "-in " + TESTDATA("data_in/BamToFastq_in1.bam") + " -remove_duplicates -out1 out/BamToFastq_out3.fastq.gz -out2 out/BamToFastq_out4.fastq.gz -write_buffer_size 1"); @@ -35,7 +60,7 @@ private slots: void single_end() { - EXECUTE("BamToFastq", "-in " + TESTDATA("data_in/BamToFastq_in3.bam") + " -out1 out/BamToFastq_out7.fastq.gz -mode single-end -write_buffer_size 1"); + EXECUTE("BamToFastq", "-in " + TESTDATA("data_in/BamToFastq_in3.bam") + " -out1 out/BamToFastq_out7.fastq.gz -write_buffer_size 1"); IS_TRUE(QFile::exists("out/BamToFastq_out7.fastq.gz")); COMPARE_GZ_FILES("out/BamToFastq_out7.fastq.gz", TESTDATA("data_out/BamToFastq_out7.fastq.gz")); } @@ -48,6 +73,7 @@ private slots: COMPARE_GZ_FILES("out/BamToFastq_out8.fastq.gz", TESTDATA("data_out/BamToFastq_out8.fastq.gz")); COMPARE_GZ_FILES("out/BamToFastq_out9.fastq.gz", TESTDATA("data_out/BamToFastq_out9.fastq.gz")); } + }; From 8fdaabc9e01a314812d853ea40be842f576ee3d8 Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Fri, 13 Dec 2024 21:02:23 +0100 Subject: [PATCH 2/4] BamToFastq: improved memory consumption of 'fix' mode. --- src/BamToFastq/main.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/BamToFastq/main.cpp b/src/BamToFastq/main.cpp index bfd38d021..6d8687715 100644 --- a/src/BamToFastq/main.cpp +++ b/src/BamToFastq/main.cpp @@ -63,6 +63,22 @@ class ConcreteTool } } + struct ReadWritten + { + bool read1 = false; + bool read2 = false; + + bool done(bool first_read) const + { + return first_read ? read1 : read2; + } + void setDone(bool first_read) + { + if(first_read) read1 = true; + else read2 = true; + } + }; + virtual void main() { //init @@ -111,10 +127,9 @@ class ConcreteTool long long c_single_end = 0; long long c_fixed = 0; int max_cached = 0; - //iterate through reads QHash> al_cache; - QSet reads_processed; + QHash reads_processed; QSharedPointer al = QSharedPointer(new BamAlignment()); while (reader.getNextAlignment(*al)) { @@ -133,13 +148,13 @@ class ConcreteTool //remove reads with same name (but handle read pairing properly) if (fix) { - QByteArray name = al->name() + (al->isRead1() ? "/1" : "/2"); - if (reads_processed.contains(name)) + ReadWritten& written = reads_processed[al->name()]; + if (written.done(al->isRead1())) { ++c_fixed; continue; } - reads_processed << name; + written.setDone(al->isRead1()); } if(is_pe) From f56ba84f61eb8aede0937dc2f7e9c707f0f11637 Mon Sep 17 00:00:00 2001 From: Marc Sturm Date: Tue, 17 Dec 2024 12:25:41 +0100 Subject: [PATCH 3/4] Added TsvDiff tool (#610) --- src/ExportcBioportal/main.cpp | 8 +- src/GSvar/CohortExpressionDataWidget.cpp | 6 +- src/GSvar/DiseaseCourseWidget.cpp | 4 +- src/GSvar/ExpressionExonWidget.cpp | 52 ++-- src/GSvar/ExpressionGeneWidget.cpp | 28 +- src/GSvar/FusionWidget.cpp | 6 +- src/GSvar/PRSWidget.cpp | 6 +- src/TsvDiff/TsvDiff.pro | 14 + src/TsvDiff/main.cpp | 313 +++++++++++++++++++++++ src/cppCORE | 2 +- src/cppGUI | 2 +- src/cppNGSD-TEST/NGSD_Test.h | 4 +- src/cppNGSD/ExportCBioPortalStudy.cpp | 4 +- src/cppNGSD/GermlineReportGenerator.cpp | 12 +- src/tools-TEST/TsvDiff_Test.h | 22 ++ src/tools-TEST/data_in/TsvDiff_in1.tsv | 19 ++ src/tools-TEST/data_in/TsvDiff_in2.tsv | 20 ++ src/tools-TEST/data_out/TsvDiff_out1.txt | 0 src/tools-TEST/data_out/TsvDiff_out2.txt | 14 + src/tools-TEST/tools-TEST.pro | 3 +- src/tools.pro | 4 + 21 files changed, 475 insertions(+), 68 deletions(-) create mode 100644 src/TsvDiff/TsvDiff.pro create mode 100644 src/TsvDiff/main.cpp create mode 100644 src/tools-TEST/TsvDiff_Test.h create mode 100644 src/tools-TEST/data_in/TsvDiff_in1.tsv create mode 100644 src/tools-TEST/data_in/TsvDiff_in2.tsv create mode 100644 src/tools-TEST/data_out/TsvDiff_out1.txt create mode 100644 src/tools-TEST/data_out/TsvDiff_out2.txt diff --git a/src/ExportcBioportal/main.cpp b/src/ExportcBioportal/main.cpp index 132f04eac..fdb99f516 100644 --- a/src/ExportcBioportal/main.cpp +++ b/src/ExportcBioportal/main.cpp @@ -63,9 +63,9 @@ class ConcreteTool int idx_icd10_catalog = samples.columnIndex("icd10_catalog"); int idx_oncotree_code = samples.columnIndex("oncotree_code"); - for (int i=0; i< samples.rowCount(); i++) + for (int i=0; i< samples.count(); i++) { - QStringList row = samples.row(i); + const QStringList& row = samples[i]; QString sample_id = db.sampleId(row[idx_tumor_name], true); SampleMTBmetadata mtb_data; @@ -177,9 +177,9 @@ class ConcreteTool int idx_datatype = getIndex(headers, "datatype"); int idx_prio = getIndex(headers, "priority"); - for (int i=0; itw_cohort_data->setRowCount(cohort_expression_data.rowCount()); + ui_->tw_cohort_data->setRowCount(cohort_expression_data.count()); ui_->tw_cohort_data->setColumnCount(cohort_expression_data.headers().size()); // create header @@ -69,9 +69,9 @@ void CohortExpressionDataWidget::loadExpressionData() } //fill table - for(int row_idx=0; row_idx 0) diff --git a/src/GSvar/DiseaseCourseWidget.cpp b/src/GSvar/DiseaseCourseWidget.cpp index cbf162354..97e8e046f 100644 --- a/src/GSvar/DiseaseCourseWidget.cpp +++ b/src/GSvar/DiseaseCourseWidget.cpp @@ -217,9 +217,9 @@ void DiseaseCourseWidget::createTableView() { for (int col_idx = 0; col_idx < table_data_.cfdna_samples.length(); ++col_idx) { - if(table_data_.mrd_tables.at(col_idx).rowCount() > 0) + if(table_data_.mrd_tables.at(col_idx).count() > 0) { - ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem(table_data_.mrd_tables.at(col_idx).row(0).at(row_idx), Qt::AlignRight)); + ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem(table_data_.mrd_tables.at(col_idx)[0].at(row_idx), Qt::AlignRight)); continue; } ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem("")); diff --git a/src/GSvar/ExpressionExonWidget.cpp b/src/GSvar/ExpressionExonWidget.cpp index 13c780a18..5898b7aff 100644 --- a/src/GSvar/ExpressionExonWidget.cpp +++ b/src/GSvar/ExpressionExonWidget.cpp @@ -117,7 +117,7 @@ void ExpressionExonWidget::loadExpressionFile() qDebug() << "TSV parsed"; //init filter data - filter_result_ = FilterResult(expression_data_.rowCount()); + filter_result_ = FilterResult(expression_data_.count()); QApplication::restoreOverrideCursor(); } @@ -215,7 +215,7 @@ void ExpressionExonWidget::applyFilters() ui_->tw_expression_table->setEnabled(false); filter_result_.reset(true); - int filtered_lines = expression_data_.rowCount(); + int filtered_lines = expression_data_.count(); QTime timer; timer.start(); @@ -239,11 +239,11 @@ void ExpressionExonWidget::applyFilters() if (gene_idx != -1) { - for(int row_idx=0; row_idxsb_min_rpb->value(); - for(int row_idx=0; row_idxsb_min_srpb_sample->value(); - for(int row_idx=0; row_idxsb_low_expression->value(); - for(int row_idx=0; row_idxsb_min_srpb_cohort->value(); - for(int row_idx=0; row_idxsb_min_logfc->value(); - for(int row_idx=0; row_idxsb_min_zscore->value(); - for(int row_idx=0; row_idxtw_expression_table->sortByColumn(9, Qt::DescendingOrder); //set number of filtered / total rows - ui_->l_filtered_rows->setText(QByteArray::number(filter_result_.flags().count(true)) + " / " + QByteArray::number(expression_data_.rowCount())); + ui_->l_filtered_rows->setText(QByteArray::number(filter_result_.flags().count(true)) + " / " + QByteArray::number(expression_data_.count())); //optimize table view GUIHelper::resizeTableCellWidths(ui_->tw_expression_table, 350); diff --git a/src/GSvar/ExpressionGeneWidget.cpp b/src/GSvar/ExpressionGeneWidget.cpp index 6e63121f6..a9d52cf6a 100644 --- a/src/GSvar/ExpressionGeneWidget.cpp +++ b/src/GSvar/ExpressionGeneWidget.cpp @@ -126,7 +126,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) if (filtering_in_progress_) return; filtering_in_progress_ = true; //skip if not necessary - int row_count = expression_data_.rowCount(); + int row_count = expression_data_.count(); if (row_count == 0) return; try @@ -183,7 +183,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) { if (!filter_result_.flags()[row_idx]) continue; - filter_result_.flags()[row_idx] = variant_gene_set_.contains(expression_data_.row(row_idx).at(gene_idx).toUtf8().trimmed()); + filter_result_.flags()[row_idx] = variant_gene_set_.contains(expression_data_[row_idx].at(gene_idx).toUtf8().trimmed()); } qDebug() << filter_result_.countPassing(); @@ -204,7 +204,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) if (!filter_result_.flags()[row_idx]) continue; // generate GeneSet from column text - GeneSet sv_genes = GeneSet::createFromText(expression_data_.row(row_idx).at(gene_idx).toUtf8().trimmed(), ','); + GeneSet sv_genes = GeneSet::createFromText(expression_data_[row_idx].at(gene_idx).toUtf8().trimmed(), ','); bool match_found = false; foreach(const QByteArray& sv_gene, sv_genes) @@ -225,7 +225,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) if (!filter_result_.flags()[row_idx]) continue; // generate GeneSet from column text - GeneSet sv_genes = GeneSet::createFromText(expression_data_.row(row_idx).at(gene_idx).toUtf8().trimmed(), ','); + GeneSet sv_genes = GeneSet::createFromText(expression_data_[row_idx].at(gene_idx).toUtf8().trimmed(), ','); filter_result_.flags()[row_idx] = sv_genes.intersectsWith(gene_whitelist); } @@ -245,7 +245,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) //skip already filtered if (!filter_result_.flags()[row_idx]) continue; - QString value = expression_data_.row(row_idx).at(tpm_idx).trimmed(); + QString value = expression_data_[row_idx].at(tpm_idx).trimmed(); if (value.isEmpty() || value == "n/a") { filter_result_.flags()[row_idx] = false; @@ -281,7 +281,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) //skip already filtered if (!filter_result_.flags()[row_idx]) continue; - QString biotype = expression_data_.row(row_idx).at(idx_biotype).trimmed().replace("_", " "); + QString biotype = expression_data_[row_idx].at(idx_biotype).trimmed().replace("_", " "); filter_result_.flags()[row_idx] = selected_biotypes.contains(biotype); } @@ -298,7 +298,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) //skip already filtered if (!filter_result_.flags()[row_idx]) continue; - QString value_sample = expression_data_.row(row_idx).at(tpm_idx); + QString value_sample = expression_data_[row_idx].at(tpm_idx); if (value_sample.isEmpty() || value_sample == "n/a") { filter_result_.flags()[row_idx] = false; @@ -340,7 +340,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) //get gene and tpm value bool in_db = false; - QByteArray ensg_number = expression_data_.row(row_idx).at(gene_id_idx).toUtf8().trimmed(); + QByteArray ensg_number = expression_data_[row_idx].at(gene_id_idx).toUtf8().trimmed(); QByteArray gene_symbol; if(ensg_mapping_.contains(ensg_number)) @@ -353,7 +353,7 @@ void ExpressionGeneWidget::applyFilters(int max_rows) continue; } double tpm = 0.0; - QString value = expression_data_.row(row_idx).at(tpm_idx).toUtf8().trimmed(); + QString value = expression_data_[row_idx].at(tpm_idx).toUtf8().trimmed(); if (!(value.isEmpty() || value == "n/a")) { tpm = Helper::toDouble(value); @@ -777,7 +777,7 @@ void ExpressionGeneWidget::loadExpressionData() //init filter mask - filter_result_ = FilterResult(expression_data_.rowCount()); + filter_result_ = FilterResult(expression_data_.count()); QApplication::restoreOverrideCursor(); } @@ -885,12 +885,12 @@ void ExpressionGeneWidget::updateTable(int max_rows) } - for(int file_row_idx=0; file_row_idx= max_rows) { - ui_->filtered_rows->setText(QByteArray::number(max_rows) + "+ / " + QByteArray::number(expression_data_.rowCount()) + " (showing only first " + QByteArray::number(max_rows) + ")"); + ui_->filtered_rows->setText(QByteArray::number(max_rows) + "+ / " + QByteArray::number(expression_data_.count()) + " (showing only first " + QByteArray::number(max_rows) + ")"); } else { - ui_->filtered_rows->setText(QByteArray::number(ui_->expression_data->rowCount()) + " / " + QByteArray::number(expression_data_.rowCount())); + ui_->filtered_rows->setText(QByteArray::number(ui_->expression_data->rowCount()) + " / " + QByteArray::number(expression_data_.count())); } diff --git a/src/GSvar/FusionWidget.cpp b/src/GSvar/FusionWidget.cpp index fa17eea8d..c099e3b60 100644 --- a/src/GSvar/FusionWidget.cpp +++ b/src/GSvar/FusionWidget.cpp @@ -61,10 +61,10 @@ void FusionWidget::loadFusionData() } //fill table widget with expression data - ui_->fusions->setRowCount(fusion_data.rowCount()); - for(int row_idx=0; row_idxfusions->setRowCount(fusion_data.count()); + for(int row_idx=0; row_idxsetRowCount(prs_table_.rowCount()); - for (int r=0; rsetRowCount(prs_table_.count()); + for (int r=0; rsetItem(r , c, GUIHelper::createTableItem(prs_table_.row(r)[c].trimmed())); + ui_.prs->setItem(r , c, GUIHelper::createTableItem(prs_table_[r][c].trimmed())); } } diff --git a/src/TsvDiff/TsvDiff.pro b/src/TsvDiff/TsvDiff.pro new file mode 100644 index 000000000..090937b7c --- /dev/null +++ b/src/TsvDiff/TsvDiff.pro @@ -0,0 +1,14 @@ +#------------------------------------------------- +# +# Project created by QtCreator 2013-10-08T13:40:57 +# +#------------------------------------------------- + +TEMPLATE = app +QT -= gui +CONFIG += console +CONFIG -= app_bundle + +SOURCES += main.cpp + +include("../app_cli.pri") diff --git a/src/TsvDiff/main.cpp b/src/TsvDiff/main.cpp new file mode 100644 index 000000000..413e30b68 --- /dev/null +++ b/src/TsvDiff/main.cpp @@ -0,0 +1,313 @@ +#include "ToolBase.h" +#include "TsvFile.h" +#include "Exceptions.h" +#include "Helper.h" + + +template +QString stringRepresentation(const T& /*element*/) +{ + return "[not implemented]"; +} +template<> +QString stringRepresentation(const QString& element) +{ + return element; +} +template<> +QString stringRepresentation(const QStringList& element) +{ + return element.join("\t"); +} + +class ConcreteTool + : public ToolBase +{ + Q_OBJECT + +public: + ConcreteTool(int& argc, char *argv[]) + : ToolBase(argc, argv) + { + } + + virtual void setup() + { + setDescription("Compares TSV files."); + setExtendedDescription(QStringList () << "A simple pairwise alignment algorithm is used, which is slow for a large number of lines"); + + addInfile("in1", "Input TSV file. If unset, reads from STDIN.", false); + addInfile("in2", "Input TSV file. If unset, reads from STDIN.", false); + + //optional + addOutfile("out", "Output file with differences. If unset, writes to stdout.", true); + addFlag("skip_comments", "Do not compare comment lines starting with '##'."); + addString("skip_comments_matching", "Comma-separated list of sub-strings for skipping comment lines (case-sensitive matching).", true); + addString("skip_cols", "Comma-separated list of colums to skip.", true); + addFlag("no_error", "Do not exit with error state if differences are detected"); + } + + struct DiffSummary + { + int added = 0; + int removed = 0; + + bool hasDifferences() const + { + return added+removed>0; + } + }; + + void removeComments(TsvFile& file, QStringList strings) + { + if (strings.isEmpty()) return; + + QStringList tmp = file.comments(); + foreach(const QString& str, strings) + { + std::remove_if(tmp.begin(), tmp.end(), [str](const QString& line){ return line.contains(str);}); + } + file.setComments(tmp); + + } + + //matrix for dynamic programming + class Matrix + { + public: + Matrix(int n, int m) + : n_(n) + , m_(m) + { + QVector tmp(m+1, -1); + tmp[0] = 0; + + data_.reserve(n+1); + for(int i=0; i<=n; ++i) + { + if (i==0) + { + data_.append(QVector(m+1, 0)); + } + else + { + data_.append(tmp); + } + } + } + + int n() const { return n_; }; + int m() const { return m_; }; + + char value(int n, int m) const + { + if (n>n_) THROW(Exception, "Matrix:value() Invalid matrix position 'n': Is " + QString::number(n) + " but max is " + QString::number(n_)); + if (m>m_) THROW(Exception, "Matrix:value() Invalid matrix position 'm': Is " + QString::number(m) + " but max is " + QString::number(m_)); + + return data_[n][m]; + } + + void setValue(int n, int m, char v) + { + if (n>n_) THROW(Exception, "Matrix:value() Invalid matrix position 'n': Is " + QString::number(n) + " but max is " + QString::number(n_)); + if (m>m_) THROW(Exception, "Matrix:value() Invalid matrix position 'm': Is " + QString::number(m) + " but max is " + QString::number(m_)); + + data_[n][m] = v; + } + + void print(QTextStream& ostream) const + { + ostream << "Matrix: " << endl; + foreach(const QVector& element, data_) + { + for (int i=0; i0) ostream << " "; + QString value = QString::number(element[i]); + value = value.rightJustified(4, ' '); + ostream << value; + } + ostream << endl; + } + ostream << endl; + } + + QList> findMatchIndices() + { + QList> output; + + int i =n_; + for (int j=m_; j>0; --j) + { + if (value(i,j)==value(i,j-1)) continue; + if (value(i,j)==value(i-1,j)) + { + --i; + ++j; + continue; + } + + output << qMakePair(i-1, j-1); + --i; + } + + std::reverse(output.begin(), output.end()); + + return output; + } + + private: + QVector> data_; + int n_; + int m_; + }; + + //build matrix for dynamic programming + template + char buildMatrix(const T& s1, const T& s2, int i, int j, Matrix& m) + { + //already calculated > return value + char v = m.value(i,j); + if (v!=-1) return v; + + if (s1[i-1]==s2[j-1]) + { + v = 1 + buildMatrix(s1, s2, i-1, j-1, m); + } + else + { + v = std::max(buildMatrix(s1, s2, i-1, j, m), buildMatrix(s1, s2, i, j-1, m)); + } + + m.setValue(i, j, v); + return v; + } + + template + void compare(const T& lines1, const T& lines2, QTextStream& ostream, DiffSummary& summary) + { + int n = lines1.count(); + int m = lines2.count(); + + //determine LCS + Matrix matrix(n, m); + buildMatrix(lines1, lines2, n, m, matrix); + //matrix.print(ostream); + QList> matches = matrix.findMatchIndices(); + //foreach(auto m, matches) ostream << m.first << "/" << m.second << endl; + + //before first match + for (int i=0; i out = Helper::openFileForWriting(getOutfile("out"), true); + QTextStream ostream(out.data()); + bool no_error = getFlag("no_error"); + + //load files + TsvFile in1; + in1.load(getInfile("in1")); + TsvFile in2; + in2.load(getInfile("in2")); + + //compare comments + if (!skip_comments) + { + removeComments(in1, skip_comments_matching); + removeComments(in2, skip_comments_matching); + + compare(in1.comments(), in2.comments(), ostream, summary_comments); + } + + //remove skipped columns + foreach(const QString& col, skip_cols) + { + int idx = in1.columnIndex(col, false); + if (idx!=-1) in1.removeColumn(idx); + + idx = in2.columnIndex(col, false); + if (idx!=-1) in2.removeColumn(idx); + } + + //compare headers + if(in1.headers()!=in2.headers()) + { + THROW(Exception, "Cannot compare files with differing column headers:\nin1:"+in1.headers().join("\t")+"\nin2:"+in2.headers().join("\t")); + } + + //compare content lines + compare(in1, in2, ostream, summary_content); + + //output + bool has_differences = summary_comments.hasDifferences() || summary_content.hasDifferences(); + if (has_differences) + { + ostream << "Difference summary:" << endl; + if (summary_comments.added) ostream << "comment lines added: " << summary_comments.added << endl; + if (summary_comments.removed) ostream << "comment lines removed: " << summary_comments.removed << endl; + if (summary_content.added) ostream << "content lines added: " << summary_content.added << endl; + if (summary_content.removed) ostream << "content lines removed: " << summary_content.removed << endl; + } + + //set exit code + if (has_differences && !no_error) + { + setExitErrorState(true); + } + } +}; + +#include "main.moc" + +int main(int argc, char *argv[]) +{ + ConcreteTool tool(argc, argv); + return tool.execute(); +} + diff --git a/src/cppCORE b/src/cppCORE index aac4da654..e94c727d7 160000 --- a/src/cppCORE +++ b/src/cppCORE @@ -1 +1 @@ -Subproject commit aac4da6549a6908ac87c103d7674c89de7e41e61 +Subproject commit e94c727d7bafe20e8d7ab8bc7b56f49f63f5ed79 diff --git a/src/cppGUI b/src/cppGUI index 95c6bb518..419301b22 160000 --- a/src/cppGUI +++ b/src/cppGUI @@ -1 +1 @@ -Subproject commit 95c6bb518effcf473d47d6313401e06130d009b4 +Subproject commit 419301b227b4e4a036505686ae7b115e1f888266 diff --git a/src/cppNGSD-TEST/NGSD_Test.h b/src/cppNGSD-TEST/NGSD_Test.h index 22377b1f6..f3bb0276e 100644 --- a/src/cppNGSD-TEST/NGSD_Test.h +++ b/src/cppNGSD-TEST/NGSD_Test.h @@ -3296,9 +3296,9 @@ private slots: QMap best; TsvFile tmp; tmp.load(TESTDATA("data_in/VariantHgvsAnnotator_comparison_vep_best_transcripts.tsv")); - for (int i=0; i0) + if (data_.prs.count()>0) { stream << endl; stream << "

" << trans("Polygener Risiko-Score (PRS)") << "

" << endl; @@ -700,9 +700,9 @@ void GermlineReportGenerator::writeHTML(QString filename) int trait_idx = data_.prs.columnIndex("trait"); int score_idx = data_.prs.columnIndex("score"); int citation_idx = data_.prs.columnIndex("citation"); - for (int r=0; r0) + if (data_.prs.count()>0) { int i_id = data_.prs.columnIndex("pgs_id"); int i_trait = data_.prs.columnIndex("trait"); int i_citation = data_.prs.columnIndex("citation"); int i_score = data_.prs.columnIndex("score"); int i_percentile = data_.prs.columnIndex("percentile"); - for (int r=0; r Date: Tue, 17 Dec 2024 13:28:21 +0100 Subject: [PATCH 4/4] TsvDiff: updated documentation --- src/TsvDiff/main.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/TsvDiff/main.cpp b/src/TsvDiff/main.cpp index 413e30b68..fee0ff943 100644 --- a/src/TsvDiff/main.cpp +++ b/src/TsvDiff/main.cpp @@ -34,10 +34,8 @@ class ConcreteTool virtual void setup() { setDescription("Compares TSV files."); - setExtendedDescription(QStringList () << "A simple pairwise alignment algorithm is used, which is slow for a large number of lines"); - - addInfile("in1", "Input TSV file. If unset, reads from STDIN.", false); - addInfile("in2", "Input TSV file. If unset, reads from STDIN.", false); + addInfile("in1", "First input TSV file.", false); + addInfile("in2", "Second input TSV file.", false); //optional addOutfile("out", "Output file with differences. If unset, writes to stdout.", true);