Skip to content

Commit

Permalink
finished
Browse files Browse the repository at this point in the history
  • Loading branch information
marc-sturm committed Jan 17, 2025
1 parent b219cd4 commit 80fc08e
Show file tree
Hide file tree
Showing 16 changed files with 57 additions and 586 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ The default output format of the quality control tools is [qcML](https://pubmed.

### VCF tools (small variants)

* [VcfAdd](doc/tools/VcfAdd.md) - Appends variants from a VCF file to another VCF file.
* [VcfAdd](doc/tools/VcfAdd.md) - Merges several VCF files into one VCF by appending one to the other.
* [VcfAnnotateConsequence](doc/tools/VcfAnnotateConsequence.md) - Adds transcript-specific consequence predictions to a VCF file (similar to Ensembl VEP).
* [VcfAnnotateFromBed](doc/tools/VcfAnnotateFromBed.md) - Annotates the INFO column of a VCF with data from a BED file.
* [VcfAnnotateFromBigWig](doc/tools/VcfAnnotateFromBigWig.md) - Annotates the INFO column of a VCF with data from a bigWig file.
Expand All @@ -137,7 +137,6 @@ The default output format of the quality control tools is [qcML](https://pubmed.
* [VcfExtractSamples](doc/tools/VcfExtractSamples.md) - Extract one or several samples from a VCF file. Can also be used to re-order sample columns.
* [VcfFilter](doc/tools/VcfFilter.md) - Filters a VCF based on the given criteria.
* [VcfLeftNormalize](doc/tools/VcfLeftNormalize.md) - Normalizes all variants and shifts indels to the left in a VCF file.
* [VcfMerge](doc/tools/VcfMerge.md) - Merges several VCF files into one VCF.
* [VcfSort](doc/tools/VcfSort.md) - Sorts variant lists according to chromosomal position.
* [VcfSplit](doc/tools/VcfSplit.md) - Splits a VCF into several chunks.
* [VcfStrip](doc/tools/VcfStrip.md) - Removes unwanted information from a VCF file.
Expand Down
25 changes: 0 additions & 25 deletions doc/tools/VcfMerge.md

This file was deleted.

75 changes: 53 additions & 22 deletions src/VcfAdd/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ class ConcreteTool
{
setDescription("Merges several VCF files into one VCF by appending one to the other.");
setExtendedDescription(QStringList() << "Variant lines from all other input files are appended to the first input file." << "VCF header lines are taken from the first input file only.");
addInfileList("in", "Input VCF files to merge.", false);
addInfileList("in", "Input VCF or VCF.GZ files to merge.", false);

//optional
addOutfile("out", "Output VCF file with all variants.", true);
addString("filter", "Tag variants from all but the first input file with this filter entry.", true);
addString("filter_desc", "Description used in the filter header - use underscore instead of spaces.", true);
addFlag("skip_duplicates", "Skip variants if they occur more than once.");

//TODO Marc: add gzip support > remove VcfMerge in ngs-bits and megSAP
changeLog(2025, 1, 17, "Added support for gzipped VCFs and removing duplicates if there is only one input file.");
changeLog(2022, 12, 8, "Initial implementation.");
}

Expand All @@ -48,30 +48,59 @@ class ConcreteTool
bool filter_used = !filter.isEmpty();
bool skip_duplicates = getFlag("skip_duplicates");

//variables to store infos from 'in'
//variables to store infos
int column_count = -1;
QSet<QByteArray> filters_defined;
QSet<QByteArray> vars;
bool is_first = true;
const int buffer_size = 1048576; //1MB buffer
char* buffer = new char[buffer_size];

//counts
int c_written = 0;
int c_dup = 0;
int c_filter = 0;

//copy in to out
for (int i=0; i<in_files.count();++i)
foreach(QString in, in_files)
{
QSharedPointer<QFile> in_p = Helper::openFileForReading(in_files[i], true);
while (!in_p->atEnd())
FILE* instream = fopen(in.toUtf8().data(), "rb");
if (instream==nullptr) THROW(FileAccessException, "Could not open file '" + in + "' for reading!");
gzFile file = gzdopen(fileno(instream), "rb"); //read binary: always open in binary mode because windows and mac open in text mode
if (file==nullptr) THROW(FileAccessException, "Could not open file '" + in + "' for reading!");

while(!gzeof(file))
{
bool is_first = (i==0);
QByteArray line = in_p->readLine();
char* char_array = gzgets(file, buffer, buffer_size);
//handle errors like truncated GZ file
if (char_array==nullptr)
{
int error_no = Z_OK;
QByteArray error_message = gzerror(file, &error_no);
if (error_no!=Z_OK && error_no!=Z_STREAM_END)
{
THROW(FileParseException, "Error while reading file '" + in + "': " + error_message);
}

continue;
}

//determine end of read line
int i=0;
while(i<buffer_size && char_array[i]!='\0' && char_array[i]!='\n' && char_array[i]!='\r')
{
++i;
}

QByteArray line = QByteArray::fromRawData(char_array, i);
while (line.endsWith('\n') || line.endsWith('\r')) line.chop(1);

//skip empty lines
if (line.isEmpty()) continue;

QByteArrayList parts = line.split('\t');
//split line to tab-separated parts if we need it
QByteArrayList parts;
if(skip_duplicates || filter_used || (line[0]=='#' && !line.startsWith("##"))) parts = line.split('\t');

//header lines
if (line[0]=='#')
Expand All @@ -88,7 +117,7 @@ class ConcreteTool
if (!line.startsWith("##"))
{
//store column count
column_count = line.split('\t').count();
column_count = parts.count();

//add filter header if missing
if (filter_used && !filters_defined.contains(filter))
Expand All @@ -99,19 +128,15 @@ class ConcreteTool
out_p->write(line);
out_p->write("\n");
}
else
else if (!line.startsWith("##")) //check number of columns matches in all other files
{
if (!line.startsWith("##"))
{
if (parts.count()!=column_count) THROW(ArgumentException, "VCF files with differing column count cannot be combined! First file has " + QString::number(column_count) + " columns, but second as " + QString::number(parts.count()) + " columns!");
}
continue;
if (parts.count()!=column_count) THROW(ArgumentException, "VCF files with differing column count cannot be combined! First file has " + QString::number(column_count) + " columns, but second has " + QString::number(parts.count()) + " columns!");
}

continue;
}

//content lines
//skip duplicate variants
if (skip_duplicates)
{
QByteArray tag = parts[VcfFile::CHROM] + '\t' + parts[VcfFile::POS] + '\t' + parts[VcfFile::REF] + '\t' + parts[VcfFile::ALT];
Expand Down Expand Up @@ -144,20 +169,26 @@ class ConcreteTool
out_p->write(line);
out_p->write("\n");
}
in_p->close();
gzclose(file);

is_first = false;
}

//clean up
out_p->close();
delete[] buffer;

//Statistics output
QTextStream stream(stdout);
stream << "Variants written: " << c_written << endl;
if (filter_used)
{
stream << "Filter entries added to variants: " << c_filter << endl;
}
if (skip_duplicates)
{
stream << "Duplicate variants skipped: " << c_dup << endl;
}
if (filter_used)
{
stream << "Filter entries added to variants: " << c_filter << endl;
}
}
};

Expand Down
14 changes: 0 additions & 14 deletions src/VcfMerge/VcfMerge.pro

This file was deleted.

99 changes: 0 additions & 99 deletions src/VcfMerge/main.cpp

This file was deleted.

6 changes: 3 additions & 3 deletions src/tools-TEST/VcfAdd_Test.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ private slots:

void default_mode()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -out out/VcfAdd_out1.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf.gz") + " -out out/VcfAdd_out1.vcf");
COMPARE_FILES("out/VcfAdd_out1.vcf", TESTDATA("data_out/VcfAdd_out1.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out1.vcf");
}

void with_filters()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -out out/VcfAdd_out2.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf.gz") + " -filter mosaic -filter_desc bli_bla_bluff. -out out/VcfAdd_out2.vcf");
COMPARE_FILES("out/VcfAdd_out2.vcf", TESTDATA("data_out/VcfAdd_out2.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out2.vcf");
}

void with_filters_and_skip_duplicates()
{
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf") + " -filter mosaic -filter_desc bli_bla_bluff. -skip_duplicates -out out/VcfAdd_out3.vcf");
EXECUTE("VcfAdd", "-in " + TESTDATA("data_in/VcfAdd_in1.vcf") + " " + TESTDATA("data_in/VcfAdd_in2.vcf.gz") + " -filter mosaic -filter_desc bli_bla_bluff. -skip_duplicates -out out/VcfAdd_out3.vcf");
COMPARE_FILES("out/VcfAdd_out3.vcf", TESTDATA("data_out/VcfAdd_out3.vcf"));
VCF_IS_VALID_HG19("out/VcfAdd_out3.vcf");
}
Expand Down
21 changes: 0 additions & 21 deletions src/tools-TEST/VcfMerge_Test.h

This file was deleted.

Loading

0 comments on commit 80fc08e

Please sign in to comment.