Merge remote-tracking branch 'remotes/origin/master' into NGSDAnnotat…

…eSV_pathogenic
imgag · Dec 17, 2024 · b34a458 · b34a458
2 parents f20b7e1 + bba8d44
commit b34a458
Show file tree

Hide file tree

Showing 25 changed files with 558 additions and 100 deletions.
diff --git a/src/BamToFastq/main.cpp b/src/BamToFastq/main.cpp
@@ -24,14 +24,15 @@ class ConcreteTool
 
 		//optional
 		addOutfile("out2", "Read 2 output FASTQ.GZ file (required for pair-end samples).", true);
-		addEnum("mode", "Determine if BAM/CRAM contains paired-end or single-end reads (default: paired-end)", true, QStringList() << "paired-end" << "single-end", "paired-end");
 		addString("reg", "Export only reads in the given region. Format: chr:start-end.", true);
-		addFlag("remove_duplicates", "Does not export duplicate reads into the FASTQ file.");
+		addFlag("remove_duplicates", "Does not export reads marked as duplicates in SAM flags into the FASTQ file.");
 		addInt("compression_level", "Output FASTQ compression level from 1 (fastest) to 9 (best compression).", true, 1);
 		addInt("write_buffer_size", "Output write buffer size (number of FASTQ entry pairs).", true, 100);
 		addInfile("ref", "Reference genome for CRAM support (mandatory if CRAM is used).", true);
 		addInt("extend", "Extend all reads to the given length. Base 'N' and base qualiy '2' are used for extension.", true, 0);
+		addFlag("fix", "Keep only one read pair if several have the same name (note: needs much memory as read names are kept in memory).");
 
+		changeLog(2024, 12, 13, "Added 'fix' parameter.");
 		changeLog(2024, 12,  9, "Added 'extend' parameter.");
 		changeLog(2020, 11, 27, "Added CRAM support.");
 		changeLog(2020,  5, 29, "Massive speed-up by writing in background. Added 'compression_level' parameter.");
@@ -62,21 +63,32 @@ class ConcreteTool
 		}
 	}
 
+	struct ReadWritten
+	{
+		bool read1 = false;
+		bool read2 = false;
+
+		bool done(bool first_read) const
+		{
+			return first_read ? read1 : read2;
+		}
+		void setDone(bool first_read)
+		{
+			if(first_read) read1 = true;
+			else read2 = true;
+		}
+	};
+
 	virtual void main()
 	{
 		//init
 		QTime timer;
 		timer.start();
 		QTextStream out(stdout);
 		BamReader reader(getInfile("in"), getInfile("ref"));
-
 		QString out1 = getOutfile("out1");
 		QString out2 = getOutfile("out2");
-		QString mode = getEnum("mode");
-
-		if (mode == "paired-end" && out2.trimmed().isEmpty()) THROW(ArgumentException, "'out2' has to be provided for paired-end reads!");
-		if (mode == "single-end" && out2.trimmed() != "") THROW(ArgumentException, "'out2' cannot be set for single-end reads!");
-
+		bool fix = getFlag("fix");
 		QString reg = getString("reg");
 		if (reg!="")
 		{
@@ -98,33 +110,29 @@ class ConcreteTool
 		QThreadPool analysis_pool;
 		analysis_pool.setMaxThreadCount(1);
 		OutputWorker* output_worker;
-		if (mode == "paired-end")
+		const bool is_pe = !out2.trimmed().isEmpty();
+		if (is_pe)
 		{
 			output_worker = new OutputWorker(pair_pool, out1, out2, compression_level);
 		}
-		else if (mode == "single-end")
+		else //single-end
 		{
 			output_worker = new OutputWorker(pair_pool, out1, compression_level);
 		}
-		else
-		{
-			THROW(ArgumentException, "Invalid mode '" + mode + "' provided!")
-		}
 		analysis_pool.start(output_worker);
 
 		long long c_unpaired = 0;
 		long long c_paired = 0;
 		long long c_duplicates = 0;
 		long long c_single_end = 0;
+		long long c_fixed = 0;
 		int max_cached = 0;
-
 		//iterate through reads
 		QHash<QByteArray, QSharedPointer<BamAlignment>> al_cache;
+		QHash<QByteArray, ReadWritten> reads_processed;
 		QSharedPointer<BamAlignment> al = QSharedPointer<BamAlignment>(new BamAlignment());
-		while (true)
+		while (reader.getNextAlignment(*al))
 		{
-			bool ok = reader.getNextAlignment(*al);
-			if (!ok) break;
 			//out << al.name() << " PAIRED=" << al.isPaired() << " SEC=" << al.isSecondaryAlignment() << " PROP=" << al.isProperPair() << endl;
 
 			//skip secondary alinments
@@ -137,7 +145,19 @@ class ConcreteTool
 				continue;
 			}
 
-			if(mode == "paired-end")
+			//remove reads with same name (but handle read pairing properly)
+			if (fix)
+			{
+				ReadWritten& written = reads_processed[al->name()];
+				if (written.done(al->isRead1()))
+				{
+					++c_fixed;
+					continue;
+				}
+				written.setDone(al->isRead1());
+			}
+
+			if(is_pe)
 			{
 				//skip unpaired
 				if(!al->isPaired())
@@ -167,30 +187,25 @@ class ConcreteTool
 					pair.status = ReadPair::TO_BE_WRITTEN;
 					++c_paired;
 				}
-				//cache read for later retrieval
-				else
+				else //cache read for later retrieval
 				{
 					al_cache.insert(name, al);
 					al = QSharedPointer<BamAlignment>(new BamAlignment());
 				}
 
 				max_cached = std::max(max_cached, al_cache.size());
 			}
-			else if (mode == "single-end")
+			else //single-end
 			{
 				ReadPair& pair = pair_pool.nextFreePair();
 				alignmentToFastq(al, pair.e1, extend);
 				pair.status = ReadPair::TO_BE_WRITTEN;
 				++c_single_end;
 			}
-			else
-			{
-				THROW(ArgumentException, "Invalid mode '" + mode + "' provided!")
-			}
 		}
 
 		//write debug output
-		if(mode == "paired-end")
+		if(is_pe)
 		{
 			out << "Pair reads (written)            : " << c_paired << endl;
 			out << "Unpaired reads (skipped)        : " << c_unpaired << endl;
@@ -200,10 +215,13 @@ class ConcreteTool
 		{
 			out << "Reads (written)                 : " << c_single_end << endl;
 		}
-
 		if (remove_duplicates)
 		{
-			out << "Duplicate reads (skipped)       : " << c_duplicates << endl;
+			out << "Duplicate tagged reads (skipped): " << c_duplicates << endl;
+		}
+		if (fix)
+		{
+			out << "Duplicate name reads (skipped)  : " << c_fixed << endl;
 		}
 		out << endl;
 		out << "Maximum cached reads            : " << max_cached << endl;

diff --git a/src/ExportcBioportal/main.cpp b/src/ExportcBioportal/main.cpp
@@ -63,9 +63,9 @@ class ConcreteTool
 		int idx_icd10_catalog = samples.columnIndex("icd10_catalog");
 		int idx_oncotree_code = samples.columnIndex("oncotree_code");
 
-		for (int i=0; i< samples.rowCount(); i++)
+		for (int i=0; i< samples.count(); i++)
 		{
-			QStringList row = samples.row(i);
+			const QStringList& row = samples[i];
 			QString sample_id = db.sampleId(row[idx_tumor_name], true);
 
 			SampleMTBmetadata mtb_data;
@@ -177,9 +177,9 @@ class ConcreteTool
 		int idx_datatype = getIndex(headers, "datatype");
 		int idx_prio = getIndex(headers, "priority");
 
-		for (int i=0; i<attr_data.rowCount(); i++)
+		for (int i=0; i<attr_data.count(); i++)
 		{
-			QStringList row = attr_data.row(i);
+			const QStringList& row = attr_data[i];
 
 			SampleAttribute attr;
 			attr.name = row[idx_attr_name];

diff --git a/src/GSvar/CohortExpressionDataWidget.cpp b/src/GSvar/CohortExpressionDataWidget.cpp
@@ -58,7 +58,7 @@ void CohortExpressionDataWidget::loadExpressionData()
 	}
 
 	//set dimensions
-	ui_->tw_cohort_data->setRowCount(cohort_expression_data.rowCount());
+	ui_->tw_cohort_data->setRowCount(cohort_expression_data.count());
 	ui_->tw_cohort_data->setColumnCount(cohort_expression_data.headers().size());
 
 	// create header
@@ -69,9 +69,9 @@ void CohortExpressionDataWidget::loadExpressionData()
 	}
 
 	//fill table
-	for(int row_idx=0; row_idx<cohort_expression_data.rowCount(); ++row_idx)
+	for(int row_idx=0; row_idx<cohort_expression_data.count(); ++row_idx)
 	{
-		QStringList row = cohort_expression_data.row(row_idx);
+		const QStringList& row = cohort_expression_data[row_idx];
 		for (int col_idx = 0; col_idx < tsv_header.size(); ++col_idx)
 		{
 			if(col_idx > 0)

diff --git a/src/GSvar/DiseaseCourseWidget.cpp b/src/GSvar/DiseaseCourseWidget.cpp
@@ -217,9 +217,9 @@ void DiseaseCourseWidget::createTableView()
 	{
 		for (int col_idx = 0; col_idx < table_data_.cfdna_samples.length(); ++col_idx)
 		{
-			if(table_data_.mrd_tables.at(col_idx).rowCount() > 0)
+			if(table_data_.mrd_tables.at(col_idx).count() > 0)
 			{
-				ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem(table_data_.mrd_tables.at(col_idx).row(0).at(row_idx), Qt::AlignRight));
+				ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem(table_data_.mrd_tables.at(col_idx)[0].at(row_idx), Qt::AlignRight));
 				continue;
 			}
 			ui_->mrd->setItem(row_idx, col_idx, GUIHelper::createTableItem(""));