Skip to content

Commit

Permalink
Merge pull request #4496 from vgteam/select-first-gam
Browse files Browse the repository at this point in the history
Add filter option to pick first alignment of a read
  • Loading branch information
xchang1 authored Jan 20, 2025
2 parents e91be89 + 457e2d4 commit db3cfbb
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
18 changes: 17 additions & 1 deletion src/readfilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ class ReadFilter {

/// Filter to only correctly mapped reads
bool only_correctly_mapped = false;

/// Filter to only keep the first alignment for each read
// This must be done single-threaded
bool only_first_alignment = false;
/// If we are picking only the first alignment for each read, keep track of what we've seen
unordered_set<string> seen_read_names;

/**
* Run all the filters on an alignment. The alignment may get modified in-place by the defray filter
Expand Down Expand Up @@ -316,7 +322,7 @@ struct Counts {

enum FilterName { read = 0, wrong_name, wrong_refpos, excluded_feature, min_score, min_sec_score, max_length, max_overhang,
min_end_matches, min_mapq, split, repeat, defray, defray_all, random, min_base_qual, subsequence,
proper_pair, unmapped, annotation, incorrectly_mapped, max_reads, filtered, last};
proper_pair, unmapped, annotation, incorrectly_mapped, first_alignment, max_reads, filtered, last};
vector<size_t> counts;
Counts () : counts(FilterName::last, 0) {}
Counts& operator+=(const Counts& other) {
Expand Down Expand Up @@ -706,6 +712,16 @@ Counts ReadFilter<Read>::filter_alignment(Read& read) {
keep = false;
}
}

if ((keep || verbose) && only_first_alignment) {
assert(threads == 1);
if (seen_read_names.count(read.name()) != 0) {
++counts.counts[Counts::FilterName::first_alignment];
keep = false;
} else {
seen_read_names.emplace(read.name());
}
}

if (!keep) {
++counts.counts[Counts::FilterName::filtered];
Expand Down
13 changes: 12 additions & 1 deletion src/subcommand/filter_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void help_filter(char** argv) {
<< " -G, --annotation K[:V] keep reads if the annotation is present and not false or empty. If a value is given, keep reads if the values are equal" << endl
<< " similar to running jq 'select(.annotation.K==V)' on the json" << endl
<< " -c, --correctly-mapped keep only reads that are marked as correctly-mapped" << endl
<< " -l, --first-alignment keep only the first alignment for each read. Must be run with 1 thread" << endl
<< " -U, --complement apply the complement of the filter implied by the other arguments." << endl
<< " -B, --batch-size work in batches of the given number of reads [default=" << vg::io::DEFAULT_PARALLEL_BATCHSIZE << "]" << endl
<< " -t, --threads N number of threads [1]" << endl;
Expand Down Expand Up @@ -119,6 +120,7 @@ int main_filter(int argc, char** argv) {
string annotation = "";
string output_fields = "";
bool correctly_mapped = false;
bool first_alignment = false;

size_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE;

Expand Down Expand Up @@ -164,14 +166,15 @@ int main_filter(int argc, char** argv) {
{"min-base-quality", required_argument, 0, 'b'},
{"annotation", required_argument, 0, 'G'},
{"correctly-mapped", no_argument, 0, 'c'},
{"first-alignment", no_argument, 0, 'l'},
{"complement", no_argument, 0, 'U'},
{"batch-size", required_argument, 0, 'B'},
{"threads", required_argument, 0, 't'},
{0, 0, 0, 0}
};

int option_index = 0;
c = getopt_long (argc, argv, "Mn:N:ea:A:pPX:F:s:r:L:Od:fauo:m:Sx:vVT:q:E:D:C:d:R:iIb:G:cUB:t:",
c = getopt_long (argc, argv, "Mn:N:ea:A:pPX:F:s:r:L:Od:fauo:m:Sx:vVT:q:E:D:C:d:R:iIb:G:clUB:t:",
long_options, &option_index);

/* Detect the end of the options. */
Expand Down Expand Up @@ -350,6 +353,9 @@ int main_filter(int argc, char** argv) {
case 'c':
correctly_mapped = true;
break;
case 'l':
first_alignment = true;
break;
case 'U':
complement_filter = true;
break;
Expand Down Expand Up @@ -380,6 +386,10 @@ int main_filter(int argc, char** argv) {
if (interleaved && max_reads != std::numeric_limits<size_t>::max() && max_reads % 2 != 0) {
std::cerr << "warning [vg filter]: max read count is not divisible by 2, but reads are paired." << std::endl;
}
if (first_alignment) {
std::cerr << "warning [vg filter]: setting --threads 1 because --first-alignment requires one thread." << std::endl;
omp_set_num_threads(1);
}

// What should our return code be?
int error_code = 0;
Expand Down Expand Up @@ -472,6 +482,7 @@ int main_filter(int argc, char** argv) {
}
filter.annotation_to_match = annotation;
filter.only_correctly_mapped = correctly_mapped;
filter.only_first_alignment = first_alignment;
filter.complement_filter = complement_filter;
filter.batch_size = batch_size;
filter.threads = get_thread_count();
Expand Down

1 comment on commit db3cfbb

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17302 seconds

Please sign in to comment.