Skip to content

Commit

Permalink
some refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
marc-sturm committed Jan 17, 2025
1 parent efb375e commit b219cd4
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 109 deletions.
199 changes: 93 additions & 106 deletions src/VcfAdd/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,33 @@ class ConcreteTool

virtual void setup()
{
setDescription("Appends variants from a VCF file to another VCF file.");
setExtendedDescription(QStringList() << "VCF header lines are taken from 'in' only.");
addInfile("in2", "Input VCF file that is added to 'in'.", false);
setDescription("Merges several VCF files into one VCF by appending one to the other.");
setExtendedDescription(QStringList() << "Variant lines from all other input files are appended to the first input file." << "VCF header lines are taken from the first input file only.");
addInfileList("in", "Input VCF files to merge.", false);

//optional
addInfile("in", "Input VCF file to add 'in2' to.", true);
addOutfile("out", "Output VCF file with variants from 'in' and 'in2'.", true);
addString("filter", "Tag variants from 'in2' with this filter entry.", true);
addOutfile("out", "Output VCF file with all variants.", true);
addString("filter", "Tag variants from all but the first input file with this filter entry.", true);
addString("filter_desc", "Description used in the filter header - use underscore instead of spaces.", true);
addFlag("skip_duplicates", "Skip variants from 'in2' which are also contained in 'in'.");
addFlag("skip_duplicates", "Skip variants if they occur more than once.");

//TODO Marc: add gzip support, allow several files in in2, allow removing duplicate variants in 'in' > remove VcfMerge in ngs-bits and megSAP
//TODO Marc: add gzip support > remove VcfMerge in ngs-bits and megSAP
changeLog(2022, 12, 8, "Initial implementation.");
}

virtual void main()
{
// init
QString in = getInfile("in");
QString in2 = getInfile("in2");
QStringList in_files = getInfileList("in");
QString out = getOutfile("out");
if(in!="" && in==out)
foreach(QString in, in_files)
{
THROW(ArgumentException, "Input and output files must be different when streaming!");
if(in==out)
{
THROW(ArgumentException, "Input and output files must be different!");
}
}
QSharedPointer<QFile> out_p = Helper::openFileForWriting(out, true);
QByteArray filter = getString("filter").toUtf8();
QByteArray filter_desc = getString("filter_desc").toUtf8();
bool filter_used = !filter.isEmpty();
Expand All @@ -52,124 +54,109 @@ class ConcreteTool
QSet<QByteArray> vars;

//counts
int c_in = 0;
int c_in2 = 0;
int c_written = 0;
int c_dup = 0;

int c_filter = 0;

//copy in to out
QSharedPointer<QFile> in_p = Helper::openFileForReading(in, true);
QSharedPointer<QFile> out_p = Helper::openFileForWriting(out, true);
while (!in_p->atEnd())
for (int i=0; i<in_files.count();++i)
{
QByteArray line = in_p->readLine();
while (line.endsWith('\n') || line.endsWith('\r')) line.chop(1);
QSharedPointer<QFile> in_p = Helper::openFileForReading(in_files[i], true);
while (!in_p->atEnd())
{
bool is_first = (i==0);
QByteArray line = in_p->readLine();
while (line.endsWith('\n') || line.endsWith('\r')) line.chop(1);

//skip empty lines
if (line.isEmpty()) continue;
//skip empty lines
if (line.isEmpty()) continue;

//headers
if (line[0]=='#')
{
//store defined filters
if (line.startsWith("##FILTER=<ID="))
{
QByteArray tmp = line.mid(13);
filters_defined << tmp.left(tmp.indexOf(','));
}
QByteArrayList parts = line.split('\t');

if (!line.startsWith("##"))
//header lines
if (line[0]=='#')
{
//store column count
column_count = line.split('\t').count();

//add filter header if missing
if (filter_used && !filters_defined.contains(filter))
if (is_first)
{
out_p->write("##FILTER=<ID="+filter+",Description=\""+filter_desc.replace("_", " ")+"\">\n");
//store defined filters
if (line.startsWith("##FILTER=<ID="))
{
QByteArray tmp = line.mid(13);
filters_defined << tmp.left(tmp.indexOf(','));
}

if (!line.startsWith("##"))
{
//store column count
column_count = line.split('\t').count();

//add filter header if missing
if (filter_used && !filters_defined.contains(filter))
{
out_p->write("##FILTER=<ID="+filter+",Description=\""+filter_desc.replace("_", " ")+"\">\n");
}
}
out_p->write(line);
out_p->write("\n");
}
else
{
if (!line.startsWith("##"))
{
if (parts.count()!=column_count) THROW(ArgumentException, "VCF files with differing column count cannot be combined! First file has " + QString::number(column_count) + " columns, but second as " + QString::number(parts.count()) + " columns!");
}
continue;
}
}
}
else if (skip_duplicates)
{
QByteArrayList parts = line.split('\t');
QByteArray tag = parts[VcfFile::CHROM] + '\t' + parts[VcfFile::POS] + '\t' + parts[VcfFile::REF] + '\t' + parts[VcfFile::ALT];
vars << tag;

++c_in;
}

out_p->write(line);
out_p->write("\n");
}
in_p->close();

//copy in2 to out
in_p = Helper::openFileForReading(in2, false);
while (!in_p->atEnd())
{
QByteArray line = in_p->readLine();
while (line.endsWith('\n') || line.endsWith('\r')) line.chop(1);

//skip empty lines
if (line.isEmpty()) continue;
continue;
}

//skip header lines
if (line[0]=='#')
{
//add filter header if missing
if (!line.startsWith("##"))
//content lines
if (skip_duplicates)
{
QByteArrayList parts = line.split('\t');
if (parts.count()!=column_count) THROW(ArgumentException, "VCF files with differing column count cannot be combined! First file has " + QString::number(column_count) + " columns, but second as " + QString::number(parts.count()) + " columns!");
QByteArray tag = parts[VcfFile::CHROM] + '\t' + parts[VcfFile::POS] + '\t' + parts[VcfFile::REF] + '\t' + parts[VcfFile::ALT];
if (vars.contains(tag))
{
++c_dup;
continue;
}
vars << tag;
}

continue;
}

//skip duplicates
QByteArrayList parts;
if (skip_duplicates)
{
parts = line.split('\t');
QByteArray tag = parts[VcfFile::CHROM] + '\t' + parts[VcfFile::POS] + '\t' + parts[VcfFile::REF] + '\t' + parts[VcfFile::ALT];
if (vars.contains(tag))
//add filter entries
if (!is_first && filter_used)
{
++c_dup;
continue;
QByteArray filter_str = parts[VcfFile::FILTER];
if (filter_str=="PASS" || filter_str==".")
{
parts[VcfFile::FILTER] = filter;
}
else
{
parts[VcfFile::FILTER] = filter_str + ";" + filter;
}
line = parts.join('\t');
++c_filter;
}
}

//add filter entries
if (filter_used)
{
if (parts.isEmpty()) parts = line.split('\t');
QByteArray filter_str = parts[VcfFile::FILTER];
if (filter_str=="PASS" || filter_str==".")
{
parts[VcfFile::FILTER] = filter;
}
else
{
parts[VcfFile::FILTER] = filter_str + ";" + filter;
}
line = parts.join('\t');
++c_written;
out_p->write(line);
out_p->write("\n");
}


out_p->write(line);
out_p->write("\n");

++c_in2;
in_p->close();
}
in_p->close();


//Statistics output
QTextStream stream(stdout);
stream << "Variants from in : " << c_in << endl;
stream << "Variants from in2 : " << c_in2 << endl;
stream << "Variants written: " << c_written << endl;
if (filter_used)
{
stream << "Filter entries added to variants: " << c_filter << endl;
}
if (skip_duplicates)
{
stream << "Duplicates skipped: " << c_dup << endl;
stream << "Duplicate variants skipped: " << c_dup << endl;
}
}
};
Expand Down
6 changes: 3 additions & 3 deletions src/tools-TEST/VcfAdd_Test.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ private slots:

void default_mode()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " -in2 " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -out out/VcfAdd_out1.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -out out/VcfAdd_out1.vcf");
COMPARE_FILES("out/VcfAdd_out1.vcf", TESTDATA("data_out/VcfAdd_out1.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out1.vcf");
}

void with_filters()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " -in2 " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -out out/VcfAdd_out2.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -out out/VcfAdd_out2.vcf");
COMPARE_FILES("out/VcfAdd_out2.vcf", TESTDATA("data_out/VcfAdd_out2.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out2.vcf");
}

void with_filters_and_skip_duplicates()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " -in2 " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -skip_duplicates -out out/VcfAdd_out3.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -skip_duplicates -out out/VcfAdd_out3.vcf");
COMPARE_FILES("out/VcfAdd_out3.vcf", TESTDATA("data_out/VcfAdd_out3.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out3.vcf");
}
Expand Down

0 comments on commit b219cd4

Please sign in to comment.