Skip to content

Commit

Permalink
Cut haplotypes leaving subgraph and don't lose path range end node
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Jan 23, 2025
1 parent e2cdf7c commit 35b9ad9
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
20 changes: 17 additions & 3 deletions src/haplotype_extracter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ void trace_haplotypes(const PathHandleGraph& source, const gbwt::GBWT& haplotype
MutablePathMutableHandleGraph& out_graph,
map<string, int>& out_thread_frequencies) {
// get our haplotypes
// TODO: Tell it to keep haplotypes that end before triggering stop_fn!
vector<pair<thread_t, gbwt::SearchState> > haplotypes = list_haplotypes(source, haplotype_database, start_handle, stop_fn);

#ifdef debug
Expand All @@ -41,6 +40,13 @@ void trace_haplotypes(const PathHandleGraph& source, const gbwt::GBWT& haplotype
const thread_t& thread = haplotypes[i].first;
for (int j = 0; j < thread.size(); j++) {
// Copy in all the steps
if (!out_graph.has_node(gbwt::Node::id(thread[j]))) {
// It's possible for us to be extracting haplotypes that leave the
// subgraph we extracted. Maybe the end node for ID range extraction
// had an alternative with a higher ID. If we find that that is
// happening, we cut the haplotype short.
break;
}
out_graph.append_step(path_handle, out_graph.get_handle(gbwt::Node::id(thread[j]), gbwt::Node::is_reverse(thread[j])));
}
}
Expand Down Expand Up @@ -210,7 +216,15 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const
#endif

if (!first_state.empty()) {
search_intermediates.push_back(make_pair(first_thread, first_state));
if (stop_fn(first_thread)) {
// We immediately want to stop the search
#ifdef debug
cerr << "\tImmediately hit limit with " << first_state.size() << " results; emitting" << endl;
#endif
search_results.push_back(make_pair(std::move(first_thread), first_state));
} else {
search_intermediates.push_back(make_pair(first_thread, first_state));
}
}

while(!search_intermediates.empty()) {
Expand Down Expand Up @@ -271,7 +285,7 @@ vector<pair<vector<gbwt::node_type>, gbwt::SearchState> > list_haplotypes(const

if (continued_members != last.second.size()) {
#ifdef debug
cerr << "Stopped " (last.second.size() - continued_members) << " results here; emitting" << endl;
cerr << "Stopped " << (last.second.size() - continued_members) << " results here; emitting" << endl;
#endif
// We need to make a result with *only* the stopping results.
// So we extend with a sentinel (0, false) GBWT node, which we don't put in our output vector for the haplotype.
Expand Down
18 changes: 16 additions & 2 deletions src/subcommand/chunk_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,9 @@ int main_chunk(int argc, char** argv) {
trace_start = graph->get_handle(output_regions[i].start, false);
trace_end = graph->get_handle(output_regions[i].end, false);
trace_steps = output_regions[i].end - output_regions[i].start;
#ifdef debug
std::cerr << "Looking for ID range " << output_regions[i].start << " to " << output_regions[i].end << ", up to " << trace_steps << " steps" << std::endl;
#endif
} else {
path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq);
step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start);
Expand All @@ -773,7 +776,7 @@ int main_chunk(int argc, char** argv) {
swap(trace_start_step, trace_end_step);
}
trace_start = graph->get_handle_of_step(trace_start_step);
trace_end = graph->get_handle_of_step(trace_start_step);
trace_end = graph->get_handle_of_step(trace_end_step);
if (output_regions[i].start > output_regions[i].end) {
// Actually we need to look in reverse alogn the path.
trace_start = graph->flip(trace_start);
Expand All @@ -783,13 +786,19 @@ int main_chunk(int argc, char** argv) {
// Count the number of steps along the backbone path, which we use to limit graph expansion.
++trace_steps;
}
#ifdef debug
std::cerr << "Looking for position range " << output_regions[i].start << " to " << output_regions[i].end << ", nodes " << graph->get_id(trace_start) << " to " << graph->get_id(trace_end) << ", up to " << trace_steps << " steps" << std::endl;
#endif
}

// Stop extending haplotypes when either they get too long or they arrive at the end.
auto stop_function = [&](const vector<gbwt::node_type>& candidate) {
if (candidate.size() >= trace_steps) {
// If you go more steps than the backbone path, stop there.
// TODO: We should add some padding here to account for insertions of manageable size.
#ifdef debug
std::cerr << "Stop because " << candidate.size() << " is enough steps" << std::endl;
#endif
return true;
}
if (gbwt::Node::id(candidate.back()) == graph->get_id(trace_end) &&
Expand All @@ -798,12 +807,17 @@ int main_chunk(int argc, char** argv) {
// The path being traced has reached the last node on our
// extracted path region, so stop here and avoid leaving
// the region.
#ifdef debug
std::cerr << "Stop because node " << gbwt::Node::id(candidate.back()) << " orientation " << gbwt::Node::is_reverse(candidate.back()) << " is our end node" << std::endl;
#endif
return true;
}
#ifdef debug
std::cerr << "Continue because node " << gbwt::Node::id(candidate.back()) << " orientation " << gbwt::Node::is_reverse(candidate.back()) << " is not our end node " << graph->get_id(trace_end) << " orientation " << graph->get_is_reverse(trace_end) << std::endl;
#endif
return false;
};

Graph g;
trace_haplotypes(*graph, *gbwt_index, trace_start, stop_function,
*subgraph, trace_thread_frequencies);
#ifdef debug
Expand Down

0 comments on commit 35b9ad9

Please sign in to comment.