diff --git a/README.md b/README.md index 55556f6f44d..1769d290e59 100644 --- a/README.md +++ b/README.md @@ -396,14 +396,6 @@ vg index hla.vg -x hla.xg vg deconstruct hla.xg -e -p "gi|568815592:29791752-29792749" > hla_variants.vcf ``` -Variants can also be inferred strictly from topology by not using `-e`, though unlike the above example, cycles are not supported. "Deconstruct" the VCF variants that were used to construct the graph. The output will be similar but identical to `small/x.vcf.gz` as `vg construct` can add edges between adjacent alts and/or do some normalization: - - -```sh -# using the same graph from the `map` example -vg deconstruct x.xg -p x > x.vcf -``` - Haplotype paths from `.gbz` or `.gbwt` indexes input can be considered using `-z` and `-g', respectively. As with `vg call`, it is best to compute snarls separately and pass them in with `-r` when working with large graphs. diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..8b078619fc9 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -135,7 +135,7 @@ void visit_contained_snarls(const PathPositionHandleGraph* graph, const vector Deconstructor::get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, const vector& use_trav, char prev_char, bool use_start) const { - auto& travs = path_travs.first; assert(ref_path_idx >=0 && ref_path_idx < travs.size()); // map strings to allele numbers (and their traversal) @@ -42,12 +40,11 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, vector trav_to_allele(travs.size()); // compute the allele as a string - auto trav_to_string = [&](const SnarlTraversal& trav) { + auto trav_to_string = [&](const Traversal& trav) { string allele; // we skip the snarl endpoints - for (int j = 1; j < trav.visit_size() - 1; ++j) { - const string& node_sequence = graph->get_sequence(graph->get_handle(trav.visit(j).node_id())); - allele += trav.visit(j).backward() ? reverse_complement(node_sequence) : node_sequence; + for (int j = 1; j < trav.size() - 1; ++j) { + allele += graph->get_sequence(trav[j]); } return toUppercase(allele); }; @@ -133,15 +130,15 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, if (untangle_allele_traversals) { // set up for reference position context mapping across allele traversals - path_handle_t ref_path = graph->get_path_handle_of_step(path_travs.second.at(ref_path_idx).first); + path_handle_t ref_path = graph->get_path_handle_of_step(trav_steps.at(ref_path_idx).first); unordered_map>>> ref_dup_nodes; unordered_map ref_simple_pos; { auto& trav = travs.at(ref_path_idx); - for (size_t i = 0; i < trav.visit_size(); ++i) { - size_t j = !reversed ? i : trav.visit_size() - 1 - i; - const Visit& visit = trav.visit(j); - nid_t node_id = visit.node_id(); + for (size_t i = 0; i < trav.size(); ++i) { + size_t j = !reversed ? i : trav.size() - 1 - i; + const handle_t& handle = trav[j]; + nid_t node_id = graph->get_id(handle); if (ref_simple_pos.find(node_id) != ref_simple_pos.end()) continue; if (ref_dup_nodes.find(node_id) != ref_dup_nodes.end()) continue; handle_t h = graph->get_handle(node_id); @@ -188,8 +185,8 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, for (size_t i = 0; i < allele_idx_unfolded.size(); i++) { int allele_no = i; int allele_trav_no = allele_idx_unfolded[i]; - auto start_step = path_travs.second.at(allele_trav_no).first; - auto end_step = path_travs.second.at(allele_trav_no).second; + auto start_step = trav_steps.at(allele_trav_no).first; + auto end_step = trav_steps.at(allele_trav_no).second; auto start_pos = graph->get_position_of_step(start_step); auto end_pos = graph->get_position_of_step(end_step); bool flip_path = start_pos > end_pos; @@ -273,7 +270,7 @@ vector Deconstructor::get_alleles(vcflib::Variant& v, int allele_no = ai_pair.second.first; int allele_trav_no = ai_pair.second.second; // update the traversal info - add_allele_path_to_info(v, allele_no, travs[allele_trav_no], reversed, !substitution); + add_allele_path_to_info(graph, v, allele_no, travs[allele_trav_no], reversed, !substitution); } } @@ -293,7 +290,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name assert(names.size() == trav_to_allele.size()); // set up our variant fields v.format.push_back("GT"); - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.format.push_back("PI"); } @@ -364,7 +361,7 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } } v.samples[sample_name]["GT"] = {blank_gt}; - if (show_path_info && path_to_sample_phase && path_restricted) { + if (show_path_info && path_to_sample_phase) { v.samples[sample_name]["PI"] = {blank_gt}; } } @@ -422,9 +419,9 @@ pair, bool> Deconstructor::choose_traversals(const string& sample_na std::any_of(gbwt_phases.begin(), gbwt_phases.end(), [](int i) { return i >= 0; }); //|| path_to_sample_phase; bool phasing_conflict = false; - int sample_ploidy = ploidy; + int sample_ploidy = 1; int min_phase = 1; - int max_phase = ploidy; + int max_phase = 1; if (has_phasing || path_to_sample_phase) { if (has_phasing) { // override ploidy with information about all phases found in input @@ -564,69 +561,40 @@ vector Deconstructor::get_context( return context; } -vector Deconstructor::get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const { - step_handle_t start_step = path_travs.second[trav_idx].first; - step_handle_t end_step = path_travs.second[trav_idx].second; - return get_context(start_step, end_step); -} - -bool Deconstructor::deconstruct_site(const Snarl* snarl) const { - auto contents = snarl_manager->shallow_contents(snarl, *graph, false); - if (contents.first.empty()) { - // Nothing but the boundary nodes in this snarl +void Deconstructor::get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const { + // empty snarl check + vector next_handles; + graph->follow_edges(snarl_start, false, [&](handle_t handle) { + next_handles.push_back(handle); + }); + if (next_handles.size() == 1 && next_handles.back() == snarl_end) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping empty site " << pb2json(*snarl) << endl; -#endif - return false; + cerr << "Skipping empty site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; +#endif + return; } + #ifdef debug #pragma omp crtiical (cerr) - cerr << "Computing traversals of site " << pb2json(*snarl) << endl; + cerr << "Computing traversals of site " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif // find every traversal that runs through a path in the graph - pair, vector > > path_travs; - path_travs = path_trav_finder->find_path_traversals(*snarl); - vector path_trav_names; - for (const pair& trav_ends : path_travs.second) { - path_trav_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_ends.first))); - } - - // pick out the traversal corresponding to a reference path, breaking ties consistently - string ref_trav_name; - for (int i = 0; i < path_travs.first.size(); ++i) { - const string& path_trav_name = path_trav_names.at(i); -#ifdef debug -#pragma omp critical (cerr) - { - cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << path_travs.first[i].visit_size() - << ", start=" << graph->get_position_of_step(path_travs.second[i].first) - << ", end=" << graph->get_position_of_step(path_travs.second[i].second) << endl - << " trav=" << pb2json(path_travs.first[i]) << endl; - } -#endif - if (ref_paths.count(path_trav_name) && - (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { - ref_trav_name = path_trav_name; -#ifdef debug -#pragma omp critical (cerr) - cerr << "Setting ref_trav_name " << ref_trav_name << endl; -#endif - } + std::tie(out_travs, out_trav_steps) = path_trav_finder->find_path_traversals(snarl_start, snarl_end); + for (const pair& trav_steps : out_trav_steps) { + out_trav_path_names.push_back(graph->get_path_name(graph->get_path_handle_of_step(trav_steps.first))); } // add in the gbwt traversals // after this, all traversals are treated the same, with metadata embedded in their names - int64_t first_gbwt_trav_idx = path_trav_names.size(); - vector gbwt_path_ids; if (gbwt_trav_finder.get() != nullptr) { const gbwt::GBWT& gbwt_index = gbwt_trav_finder->get_gbwt(); - pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(*snarl); + pair, vector> thread_travs = gbwt_trav_finder->find_path_traversals(snarl_start, snarl_end); for (int i = 0; i < thread_travs.first.size(); ++i) { // We need to get a bunch of metadata about the path, but the GBWT // we have might not even have structured path names stored. @@ -635,7 +603,6 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { continue; } - gbwt_path_ids.push_back(path_id); PathSense sense = gbwtgraph::get_path_sense(gbwt_index, path_id, gbwt_reference_samples); if (sense == PathSense::HAPLOTYPE) { @@ -648,13 +615,51 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { gbwtgraph::get_path_haplotype(gbwt_index, path_id, sense), gbwtgraph::get_path_phase_block(gbwt_index, path_id, sense), gbwtgraph::get_path_subrange(gbwt_index, path_id, sense)); - path_trav_names.push_back(path_name); - path_travs.first.push_back(thread_travs.first[i]); - // dummy handles so we can use the same code as the named path traversals above - path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); + out_trav_path_names.push_back(path_name); + out_travs.push_back(std::move(thread_travs.first[i])); } } } +} + + +bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const { + + vector travs; + vector trav_path_names; + // note that this vector (unlike the above two) is for embedded paths only (not GBWT) + vector> trav_steps; + + // compute all the traversals from embedded paths and gbwt + this->get_traversals(snarl_start, snarl_end, travs, trav_path_names, trav_steps); + + if (travs.empty()) { + return false; + } + + // pick out the traversal corresponding to an embedded reference path, breaking ties consistently + string ref_trav_name; + for (int i = 0; i < trav_steps.size(); ++i) { + const string& path_trav_name = trav_path_names[i]; +#ifdef debug +#pragma omp critical (cerr) + { + cerr << "Traversal " << i << ": name=" << path_trav_name << ", size=" << travs[i].size() + << ", start=" << graph->get_position_of_step(trav_steps[i].first) + << ", end=" << graph->get_position_of_step(trav_steps[i].second) << endl + << " trav=" << traversal_to_string(travs[i]) << endl; + } +#endif + if (ref_paths.count(path_trav_name) && + (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + ref_trav_name = path_trav_name; +#ifdef debug +#pragma omp critical (cerr) + cerr << "Setting ref_trav_name " << ref_trav_name << endl; +#endif + } + } + // remember all the reference traversals (there can be more than one only in the case of a // cycle in the reference path @@ -666,8 +671,8 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { // hacky subpath support -- gets added to variant on output vector ref_offsets; if (!ref_trav_name.empty()) { - for (int i = 0; i < path_travs.first.size(); ++i) { - const string& path_trav_name = path_trav_names.at(i); + for (int i = 0; i < trav_steps.size(); ++i) { + const string& path_trav_name = trav_path_names.at(i); subrange_t subrange ; Paths::strip_subrange(path_trav_name, &subrange); int64_t sub_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; @@ -687,46 +692,16 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { if (ref_travs.empty()) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because no reference traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } - - bool exhaustive = !path_restricted && gbwt_trav_finder.get() == nullptr; - if (exhaustive) { - // add in the exhaustive traversals - vector additional_travs; - - // exhaustive traversal can't do all snarls - if (snarl->type() != ULTRABUBBLE) { - return false; - } - if (!check_max_nodes(snarl)) { -#pragma omp critical (cerr) - cerr << "Warning: Skipping site because it is too complex for exhaustive traversal enumeration: " << pb2json(*snarl) << endl << " Consider using -e to traverse embedded paths" << endl; - return false; - } - additional_travs = explicit_exhaustive_traversals(snarl); - - // happens when there was a nested non-ultrabubble snarl - if (additional_travs.empty()) { - return false; - } - path_travs.first.insert(path_travs.first.end(), additional_travs.begin(), additional_travs.end()); - for (int i = 0; i < additional_travs.size(); ++i) { - // dummy names so we can use the same code as the named path traversals above - path_trav_names.push_back(" >>" + std::to_string(i)); - // dummy handles so we can use the same code as the named path traversals above - path_travs.second.push_back(make_pair(step_handle_t(), step_handle_t())); - } - - } // there's not alt path through the snarl, so we can't make an interesting variant - if (path_travs.first.size() < 2) { + if (travs.size() < 2) { #ifdef debug #pragma omp critical (cerr) - cerr << "Skipping site because to alt traversal was found " << pb2json(*snarl) << endl; + cerr << "Skipping site because to alt traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; } @@ -737,190 +712,141 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { // to compare with equivalent windows from the alternate allele paths // we will associate these 1:1 with reference traversals - // remember that path_travs := pair, vector > > path_travs; + // remember that path_travs := pair, vector > > path_travs; // map from each path_trav index to the ref_trav index it best maps to vector path_trav_to_ref_trav; - if (ref_travs.size() > 1 && this->path_jaccard_window && exhaustive && !exhaustive_jaccard_warning) { -#pragma omp critical (cerr) - cerr << "warning [vg deconstruct]: Conext Jaccard logic for multiple references disabled with exhaustive traversals. Use -e, -g or GBZ input to switch to path-based traversals only (recommended)." << endl; - exhaustive_jaccard_warning = true; - } - if (ref_travs.size() > 1 && this->path_jaccard_window && !exhaustive) { - path_trav_to_ref_trav.resize(path_travs.first.size()); + if (ref_travs.size() > 1 && this->path_jaccard_window) { + path_trav_to_ref_trav.resize(trav_steps.size()); #ifdef debug #pragma omp critical (cerr) cerr << "Multiple ref traversals!" << endl; #endif - { - vector> ref_contexts(ref_travs.size()); + vector> ref_contexts(ref_travs.size()); #pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < ref_travs.size(); ++i) { - auto& trav_id = ref_travs[i]; - ref_contexts[i] = get_context(path_travs, trav_id); - } - // now for each traversal, we compute and equivalent context and match it to a ref context - // using a jaccard metric over node ids + for (size_t i = 0; i < ref_travs.size(); ++i) { + ref_contexts[i] = get_context(trav_steps[ref_travs[i]].first, trav_steps[ref_travs[i]].second); + } + + // now for each traversal, we compute and equivalent context and match it to a ref context + // using a jaccard metric over node ids #pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < path_travs.first.size(); ++i) { - vector context = get_context(path_travs, i); - // map jaccard metric to the index of the ref_trav - vector> ref_mappings; - for (uint64_t j = 0; j < ref_travs.size(); ++j) { - ref_mappings.push_back(make_pair( - context_jaccard( - ref_contexts[j], - context), - ref_travs[j])); - } - std::sort(ref_mappings.begin(), ref_mappings.end()); - // the best is the last, which has the highest jaccard - path_trav_to_ref_trav[i] = ref_mappings.back().second; + for (size_t i = 0; i < trav_steps.size(); ++i) { + vector context = get_context(trav_steps[i].first, trav_steps[i].second); + // map jaccard metric to the index of the ref_trav + vector> ref_mappings; + for (uint64_t j = 0; j < ref_travs.size(); ++j) { + ref_mappings.push_back(make_pair( + context_jaccard( + ref_contexts[j], + context), + ref_travs[j])); } + std::sort(ref_mappings.begin(), ref_mappings.end()); + // the best is the last, which has the highest jaccard + path_trav_to_ref_trav[i] = ref_mappings.back().second; } } // we write a variant for every reference traversal // (optionally) selecting the subset of path traversals that are 1:1 -//#pragma omp parallel for for (size_t i = 0; i < ref_travs.size(); ++i) { -//#pragma omp task firstprivate(i) - { - auto& ref_trav_idx = ref_travs[i]; - auto& ref_trav_offset = ref_offsets[i]; + auto& ref_trav_idx = ref_travs[i]; + auto& ref_trav_offset = ref_offsets[i]; - const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + const Traversal& ref_trav = travs[ref_trav_idx]; - vcflib::Variant v; - v.quality = 60; + vcflib::Variant v; + v.quality = 60; - // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); - if (contig_name == PathMetadata::NO_LOCUS_NAME) { + // in VCF we usually just want the contig + string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + if (contig_name == PathMetadata::NO_LOCUS_NAME) { + contig_name = ref_trav_name; + } else if (long_ref_contig) { + // the sample name isn't unique enough, so put a full ugly name in the vcf + if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { contig_name = ref_trav_name; - } else if (long_ref_contig) { - // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; - } else { - contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), - contig_name, - PathMetadata::parse_haplotype(ref_trav_name), - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE); - } + } else { + contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, + PathMetadata::parse_sample_name(ref_trav_name), + contig_name, + PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); } + } - // write variant's sequenceName (VCF contig) - v.sequenceName = contig_name; - - // Map our snarl endpoints to oriented positions in the embedded path in the graph - handle_t first_path_handle; - size_t first_path_pos; - bool use_start; - assert(ref_trav_idx < first_gbwt_trav_idx); - step_handle_t start_step = path_travs.second[ref_trav_idx].first; - step_handle_t end_step = path_travs.second[ref_trav_idx].second; - handle_t start_handle = graph->get_handle_of_step(start_step); - handle_t end_handle = graph->get_handle_of_step(end_step); - size_t start_pos = graph->get_position_of_step(start_step); - size_t end_pos = graph->get_position_of_step(end_step); - use_start = start_pos < end_pos; - first_path_handle = use_start ? start_handle : end_handle; - first_path_pos = use_start ? start_pos : end_pos; + // write variant's sequenceName (VCF contig) + v.sequenceName = contig_name; + + // Map our snarl endpoints to oriented positions in the embedded path in the graph + handle_t first_path_handle; + size_t first_path_pos; + bool use_start; + step_handle_t start_step = trav_steps[ref_trav_idx].first; + step_handle_t end_step = trav_steps[ref_trav_idx].second; + handle_t start_handle = graph->get_handle_of_step(start_step); + handle_t end_handle = graph->get_handle_of_step(end_step); + size_t start_pos = graph->get_position_of_step(start_step); + size_t end_pos = graph->get_position_of_step(end_step); + use_start = start_pos < end_pos; + first_path_handle = use_start ? start_handle : end_handle; + first_path_pos = use_start ? start_pos : end_pos; - // Get the first visit of our snarl traversal - const Visit& first_trav_visit = use_start ? ref_trav.visit(0) : ref_trav.visit(ref_trav.visit_size() - 1); - - char prev_char; - if ((use_start && first_trav_visit.backward() == graph->get_is_reverse(first_path_handle)) || - (!use_start && first_trav_visit.backward() != graph->get_is_reverse(first_path_handle))) { - // Our path and traversal have consistent orientation. leave off the end of the start node going forward - first_path_pos += graph->get_length(first_path_handle); - prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); - } else { - // They are flipped: leave off the beginning of the start node going backward - prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); - } + // Get the first visit of our snarl traversal + const handle_t& first_trav_handle = use_start ? ref_trav.front() : ref_trav.back(); + + char prev_char; + if ((use_start && graph->get_is_reverse(first_trav_handle) == graph->get_is_reverse(first_path_handle)) || + (!use_start && graph->get_is_reverse(first_trav_handle) != graph->get_is_reverse(first_path_handle))) { + // Our path and traversal have consistent orientation. leave off the end of the start node going forward + first_path_pos += graph->get_length(first_path_handle); + prev_char = ::toupper(graph->get_sequence(first_path_handle)[graph->get_length(first_path_handle) - 1]); + } else { + // They are flipped: leave off the beginning of the start node going backward + prev_char = reverse_complement(::toupper(graph->get_sequence(first_path_handle)[0])); + } - // shift from 0-based to 1-based for VCF - first_path_pos += 1; + // shift from 0-based to 1-based for VCF + first_path_pos += 1; - v.position = first_path_pos + ref_trav_offset; + v.position = first_path_pos + ref_trav_offset; - v.id = print_snarl(*snarl); + v.id = print_snarl(graph, snarl_start, snarl_end); - // Convert the snarl traversals to strings and add them to the variant - vector use_trav(path_travs.first.size()); - if (path_trav_to_ref_trav.size()) { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); - } - } else { - for (uint64_t i = 0; i < use_trav.size(); ++i) { - use_trav[i] = true; - } + // Convert the snarl traversals to strings and add them to the variant + vector use_trav(travs.size()); + if (path_trav_to_ref_trav.size()) { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = (ref_trav_idx == path_trav_to_ref_trav[i]); } + } else { + for (uint64_t i = 0; i < use_trav.size(); ++i) { + use_trav[i] = true; + } + } - vector trav_to_allele = get_alleles(v, path_travs, - ref_trav_idx, - use_trav, - prev_char, use_start); + vector trav_to_allele = get_alleles(v, travs, trav_steps, + ref_trav_idx, + use_trav, + prev_char, use_start); - // Fill in the genotypes - if (path_restricted || gbwt_trav_finder.get()) { - get_genotypes(v, path_trav_names, trav_to_allele); - } + // Fill in the genotypes + get_genotypes(v, trav_path_names, trav_to_allele); - // we only bother printing out sites with at least 1 non-reference allele - if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { - if (path_restricted || gbwt_trav_finder.get()) { - // run vcffixup to add some basic INFO like AC - vcf_fixup(v); - } - add_variant(v); - } + // we only bother printing out sites with at least 1 non-reference allele + if (!std::all_of(trav_to_allele.begin(), trav_to_allele.end(), [](int i) { return (i == 0 || i == -1); })) { + // run vcffixup to add some basic INFO like AC + vcf_fixup(v); + add_variant(v); } } -//#pragma omp taskwait return true; } -/** - * Convenience wrapper function for deconstruction of multiple paths. - */ -void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, - bool include_nested, - int context_jaccard_window, - bool untangle_traversals, - bool keep_conflicted, - bool strict_conflicts, - bool long_ref_contig, - gbwt::GBWT* gbwt) { - - this->graph = graph; - this->snarl_manager = snarl_manager; - this->path_restricted = path_restricted_traversals; - this->ploidy = ploidy; - this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested; - this->path_jaccard_window = context_jaccard_window; - this->untangle_allele_traversals = untangle_traversals; - this->keep_conflicted_genotypes = keep_conflicted; - this->strict_conflict_checking = strict_conflicts; - if (gbwt) { - this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); - } - - // the need to use nesting is due to a problem with omp tasks and shared state - // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) - omp_set_nested(1); - omp_set_max_active_levels(3); - +string Deconstructor::get_vcf_header() { // Keep track of the non-reference paths in the graph. They'll be our sample names ref_samples.clear(); set ref_haplotypes; @@ -984,6 +910,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } } + if (sample_to_haps.empty()) { + cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " + << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " + << "now only works on embedded paths and GBWT threads" << endl; + exit(1); + } + // find some stats about the haplotypes for each sample gbwt_sample_to_phase_range.clear(); sample_ploidys.clear(); @@ -995,10 +928,8 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand // print the VCF header stringstream stream; stream << "##fileformat=VCFv4.2" << endl; - if (path_restricted || gbwt) { - stream << "##FORMAT=" << endl; - } - if (show_path_info && path_to_sample_phase && path_restricted) { + stream << "##FORMAT=" << endl; + if (show_path_info && path_to_sample_phase) { stream << "##FORMAT=" << endl; } if (path_to_sample_phase || gbwt) { @@ -1008,12 +939,10 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } stream << "\">" << endl; } - if (path_restricted || gbwt) { - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - } + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; + stream << "##INFO=" << endl; if (include_nested) { stream << "##INFO=" << endl; stream << "##INFO=" << endl; @@ -1091,125 +1020,132 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - if (path_restricted || gbwt) { - for (auto& sample_name : sample_names) { - stream << "\t" << sample_name; - } + for (auto& sample_name : sample_names) { + stream << "\t" << sample_name; } stream << endl; + return stream.str(); +} + +void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { + + vector snarls; + vector queue; + + // read all our snarls into a list + snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { + queue.push_back(snarl); + }); + if (include_nested) { + while (!queue.empty()) { + const Snarl* snarl = queue.back(); + queue.pop_back(); + snarls.push_back(snarl); + const vector& children = snarl_manager->children_of(snarl); + snarls.insert(snarls.end(), children.begin(), children.end()); + } + } else { + swap(snarls, queue); + } + + // process the whole shebang in parallel +#pragma omp parallel for schedule(dynamic,1) + for (size_t i = 0; i < snarls.size(); i++) { + deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()), + graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward())); + } +} + +void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { + // logic copied from vg call (graph_caller.cpp) + + // Used to recurse on children of parents that can't be called + size_t thread_count = get_thread_count(); + vector> snarl_queue(thread_count); + + // Run the deconstructor on a snarl, and queue up the children if it fails + auto process_snarl = [&](const Snarl* snarl) { + if (!snarl_manager->is_trivial(snarl, *graph)) { + bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward())); + if (include_nested || !was_deconstructed) { + const vector& children = snarl_manager->children_of(snarl); + vector& thread_queue = snarl_queue[omp_get_thread_num()]; + thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + } + } + }; + + // Start with the top level snarls + snarl_manager->for_each_top_level_snarl_parallel(process_snarl); + + // Then recurse on any children the snarl caller failed to handle + while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), + [](const vector& snarl_vec) {return snarl_vec.empty();})) { + vector cur_queue; + for (vector& thread_queue : snarl_queue) { + cur_queue.reserve(cur_queue.size() + thread_queue.size()); + std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); + thread_queue.clear(); + } +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < cur_queue.size(); ++i) { + process_snarl(cur_queue[i]); + } + } +} + +/** + * Convenience wrapper function for deconstruction of multiple paths. + */ +void Deconstructor::deconstruct(vector ref_paths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, + bool include_nested, + int context_jaccard_window, + bool untangle_traversals, + bool keep_conflicted, + bool strict_conflicts, + bool long_ref_contig, + gbwt::GBWT* gbwt) { + + this->graph = graph; + this->ref_paths = set(ref_paths.begin(), ref_paths.end()); + this->include_nested = include_nested; + this->path_jaccard_window = context_jaccard_window; + this->untangle_allele_traversals = untangle_traversals; + this->keep_conflicted_genotypes = keep_conflicted; + this->strict_conflict_checking = strict_conflicts; + if (gbwt) { + this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt); + } + this->gbwt = gbwt; + + // the need to use nesting is due to a problem with omp tasks and shared state + // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 1) + omp_set_nested(1); + omp_set_max_active_levels(3); - string hstr = stream.str(); + string hstr = this->get_vcf_header(); assert(output_vcf.openForOutput(hstr)); cout << output_vcf.header << endl; // create the traversal finder map reads_by_name; - path_trav_finder = unique_ptr(new PathTraversalFinder(*graph, - *snarl_manager)); - - if (!path_restricted && !gbwt) { - trav_finder = unique_ptr(new ExhaustiveTraversalFinder(*graph, - *snarl_manager, - true)); - - } - + path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); + if (gbwt != nullptr) { gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } - vector snarls_todo; - // Do the top-level snarls in parallel - snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - vector todo(1, snarl); - vector next; - while (!todo.empty()) { - for (auto next_snarl : todo) { - // if we can't make a variant from the snarl due to not finding - // paths through it, we try again on the children - // note: we may want to push the parallelism down a bit -#pragma omp critical (snarls_todo) - snarls_todo.push_back(next_snarl); - if (include_nested) { - // n.b. we no longer attempt to deconstruct the site to determine if we nest - const vector& children = snarl_manager->children_of(next_snarl); - next.insert(next.end(), children.begin(), children.end()); - } - } - swap(todo, next); - next.clear(); - } - }); - -//#pragma omp parallel -//#pragma omp single - { -#pragma omp parallel for schedule(dynamic,1) - for (size_t i = 0; i < snarls_todo.size(); i++) { -//#pragma omp task firstprivate(i) - { - auto& snarl = snarls_todo[i]; - deconstruct_site(snarl); - } - } + if (include_nested) { + deconstruct_graph(snarl_manager); + } else { + deconstruct_graph_top_down(snarl_manager); } -//#pragma omp taskwait // write variants in sorted order write_variants(cout, snarl_manager); } -bool Deconstructor::check_max_nodes(const Snarl* snarl) const { - unordered_set nodeset = snarl_manager->deep_contents(snarl, *graph, false).first; - int node_count = 0; - for (auto node_id : nodeset) { - handle_t node = graph->get_handle(node_id); - if (graph->get_degree(node, true) > 1 || graph->get_degree(node, false) > 1) { - ++node_count; - if (node_count > max_nodes_for_exhaustive) { - return false; - } - } - } - return true; -}; - -vector Deconstructor::explicit_exhaustive_traversals(const Snarl* snarl) const { - vector out_travs; - bool ultra_all_the_way_down = true; - function extend_trav = - [&](const SnarlTraversal& trav, const Snarl& nested_snarl) { - // exhaustive traversal finder is limited. if we find something - // that's not an ultrabubble, not much we can do - if (nested_snarl.type() != ULTRABUBBLE) { - ultra_all_the_way_down = false; - return; - } - vector nested_travs = trav_finder->find_traversals(nested_snarl); - for (auto& nested_trav : nested_travs) { - SnarlTraversal extended_trav = trav; - bool is_explicit = true; - for (int i = 0; i < nested_trav.visit_size(); ++i) { - if (nested_trav.visit(i).node_id() != 0) { - Visit* visit = extended_trav.add_visit(); - *visit = nested_trav.visit(i); - } else { - extend_trav(extended_trav, nested_trav.visit(i).snarl()); - is_explicit = false; - } - } - if (is_explicit) { - out_travs.push_back(extended_trav); - } - } - }; - SnarlTraversal trav; - extend_trav(trav, *snarl); - if (!ultra_all_the_way_down) { - out_travs.clear(); - } - return out_travs; -} } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index a8c66e369bb..3ae015b45a1 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -38,9 +38,7 @@ class Deconstructor : public VCFOutputCaller { // deconstruct the entire graph to cout. // Not even a little bit thread safe. - void deconstruct(vector refpaths, const PathPositionHandleGraph* grpah, SnarlManager* snarl_manager, - bool path_restricted_traversals, - int ploidy, + void deconstruct(vector refpaths, const PathPositionHandleGraph* graph, SnarlManager* snarl_manager, bool include_nested, int context_jaccard_window, bool untangle_traversals, @@ -51,14 +49,34 @@ class Deconstructor : public VCFOutputCaller { private: + // initialize the vcf and get the header + string get_vcf_header(); + + // deconstruct all snarls in parallel (ie nesting relationship ignored) + void deconstruct_graph(SnarlManager* snarl_manager); + + // deconstruct all top-level snarls in parallel + // nested snarls are processed after their parents in the same thread + // (same logic as vg call) + void deconstruct_graph_top_down(SnarlManager* snarl_manager); + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) - bool deconstruct_site(const Snarl* site) const; + bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end) const; + + // get the traversals for a given site + // this returns a combination of embedded path traversals and gbwt traversals + // the embedded paths come first, and only they get trav_steps. + // so you can use trav_steps.size() to find the index of the first gbwt traversal... + void get_traversals(const handle_t& snarl_start, const handle_t& snarl_end, + vector& out_travs, + vector& out_trav_path_names, + vector>& out_trav_steps) const; // convert traversals to strings. returns mapping of traversal (offset in travs) to allele vector get_alleles(vcflib::Variant& v, - const pair, - vector>>& path_travs, + const vector& travs, + const vector>& trav_steps, int ref_path_idx, const vector& use_trav, char prev_char, bool use_start) const; @@ -74,19 +92,6 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // check to see if a snarl is too big to exhaustively traverse - bool check_max_nodes(const Snarl* snarl) const; - - // get traversals from the exhaustive finder. if they have nested visits, fill them in (exhaustively) - // with node visits - vector explicit_exhaustive_traversals(const Snarl* snarl) const; - - // gets a sorted node id context for a given path - vector get_context( - const pair, - vector>>& path_travs, - const int& trav_idx) const; - // the underlying context-getter vector get_context( step_handle_t start_step, @@ -101,22 +106,14 @@ class Deconstructor : public VCFOutputCaller { const dac_vector<>& target, const vector& query) const; - // toggle between exhaustive and path restricted traversal finder - bool path_restricted = false; - - // the max ploidy we expect. - int ploidy; - // the graph const PathPositionHandleGraph* graph; - // the snarl manager - SnarlManager* snarl_manager; + // the gbwt + gbwt::GBWT* gbwt; // the traversal finders. we always use a path traversal finder to get the reference path unique_ptr path_trav_finder; - // we optionally use another (exhaustive for now) traversal finder if we don't want to rely on paths - unique_ptr trav_finder; // we can also use a gbwt for traversals unique_ptr gbwt_trav_finder; // When using the gbwt we need some precomputed information to ask about stored paths. @@ -143,9 +140,6 @@ class Deconstructor : public VCFOutputCaller { // the sample ploidys given in the phases in our path names unordered_map sample_ploidys; - // upper limit of degree-2+ nodes for exhaustive traversal - int max_nodes_for_exhaustive = 100; - // target window size for determining the correct reference position for allele traversals with path jaccard int path_jaccard_window = 10000; @@ -160,9 +154,6 @@ class Deconstructor : public VCFOutputCaller { // should we keep conflicted genotypes or not bool keep_conflicted_genotypes = false; - - // warn about context jaccard not working with exhaustive traversals - mutable atomic exhaustive_jaccard_warning; }; // helpel for measuring set intersectiond and union size diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 921c941f02e..355bfceb9ac 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -366,6 +366,17 @@ void VCFOutputCaller::set_nested(bool nested) { include_nested = nested; } +void VCFOutputCaller::add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, const Traversal& trav, + bool reversed, bool one_based) const { + SnarlTraversal proto_trav; + for (const handle_t& handle : trav) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph->get_id(handle)); + visit->set_backward(graph->get_is_reverse(handle)); + } + this->add_allele_path_to_info(v, allele, proto_trav, reversed, one_based); +} + void VCFOutputCaller::add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const { auto& trav_info = v.info["AT"]; @@ -726,6 +737,18 @@ void VCFOutputCaller::flatten_common_allele_ends(vcflib::Variant& variant, bool } } +string VCFOutputCaller::print_snarl(const HandleGraph* graph, const handle_t& snarl_start, + const handle_t& snarl_end, bool in_brackets) const { + Snarl snarl; + Visit* start = snarl.mutable_start(); + start->set_node_id(graph->get_id(snarl_start)); + start->set_backward(graph->get_is_reverse(snarl_start)); + Visit* end = snarl.mutable_end(); + end->set_node_id(graph->get_id(snarl_end)); + end->set_backward(graph->get_is_reverse(snarl_end)); + return this->print_snarl(snarl, in_brackets); +} + string VCFOutputCaller::print_snarl(const Snarl& snarl, bool in_brackets) const { // todo, should we canonicalize here by putting lexicographic lowest node first? nid_t start_node_id = snarl.start().node_id(); diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 7bdcfe9d880..0485d03db3f 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -103,8 +103,12 @@ class VCFOutputCaller { protected: /// add a traversal to the VCF info field in the format of a GFA W-line or GAF path + void add_allele_path_to_info(const HandleGraph* graph, vcflib::Variant& v, int allele, + const Traversal& trav, bool reversed, bool one_based) const; + /// legacy version of above void add_allele_path_to_info(vcflib::Variant& v, int allele, const SnarlTraversal& trav, bool reversed, bool one_based) const; + /// convert a traversal into an allele string string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const; @@ -131,6 +135,8 @@ class VCFOutputCaller { /// print a snarl in a consistent form like >3435<12222 /// if in_brackets set to true, do (>3435<12222) instead (this is only used for nested caller) + string print_snarl(const HandleGraph* grpah, const handle_t& snarl_start, const handle_t& snarl_end, bool in_brackets = false) const; + /// legacy version of above string print_snarl(const Snarl& snarl, bool in_brackets = false) const; /// do the opposite of above diff --git a/src/mcmc_caller.cpp b/src/mcmc_caller.cpp index 956123d6f3c..de526ce3955 100644 --- a/src/mcmc_caller.cpp +++ b/src/mcmc_caller.cpp @@ -96,7 +96,7 @@ namespace vg { return false; } - PathTraversalFinder trav_finder(*path_position_handle_graph, snarl_manager, ref_paths); + PathTraversalFinder trav_finder(*path_position_handle_graph, ref_paths); auto trav_results = trav_finder.find_path_traversals(snarl); vector ref_path = trav_results.first; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 975336f6aa4..497922e5b70 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -42,12 +42,10 @@ void help_deconstruct(char** argv){ << " -P, --path-prefix NAME All paths [excluding GBWT threads / non-reference GBZ paths] beginning with NAME used as reference (multiple allowed)." << endl << " Other non-ref paths not considered as samples. " << endl << " -r, --snarls FILE Snarls file (from vg snarls) to avoid recomputing." << endl - << " -g, --gbwt FILE only consider alt traversals that correspond to GBWT threads FILE (not needed for GBZ graph input)." << endl + << " -g, --gbwt FILE consider alt traversals that correspond to GBWT haplotypes in FILE (not needed for GBZ graph input)." << endl << " -T, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names and AT fields in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl - << " -e, --path-traversals Only consider traversals that correspond to paths in the graph." << endl << " -a, --all-snarls Process all snarls, including nested snarls (by default only top-level snarls reported)." << endl - << " -d, --ploidy N Expected ploidy. If more traversals found, they will be flagged as conflicts (default: 2)" << endl << " -c, --context-jaccard N Set context mapping size used to disambiguate alleles at sites with multiple reference traversals (default: 10000)." << endl << " -u, --untangle-travs Use context mapping to determine the reference-relative positions of each step in allele traversals (AP INFO field)." << endl << " -K, --keep-conflicted Retain conflicted genotypes in output." << endl @@ -73,8 +71,6 @@ int main_deconstruct(int argc, char** argv){ bool gbz_translation = false; bool path_restricted_traversals = false; bool show_progress = false; - int ploidy = 2; - bool set_ploidy = false; bool all_snarls = false; bool keep_conflicted = false; bool strict_conflicts = false; @@ -96,7 +92,6 @@ int main_deconstruct(int argc, char** argv){ {"translation", required_argument, 0, 'T'}, {"gbz-translation", no_argument, 0, 'O'}, {"path-traversals", no_argument, 0, 'e'}, - {"ploidy", required_argument, 0, 'd'}, {"context-jaccard", required_argument, 0, 'c'}, {"untangle-travs", no_argument, 0, 'u'}, {"all-snarls", no_argument, 0, 'a'}, @@ -140,12 +135,12 @@ int main_deconstruct(int argc, char** argv){ gbz_translation = true; break; case 'e': - path_restricted_traversals = true; + cerr << "Warning [vg deconstruct]: -e is deprecated as it's now on default" << endl; break; case 'd': - ploidy = parse(optarg); - set_ploidy = true; + cerr << "Warning [vg deconstruct]: -d is deprecated - ploidy now inferred from haplotypes in path names" << endl; break; + break; case 'c': context_jaccard_window = parse(optarg); break; @@ -206,16 +201,6 @@ int main_deconstruct(int argc, char** argv){ cerr << "Error [vg deconstruct]: -O can only be used when input graph is in GBZ format" << endl; } - if (set_ploidy && !path_restricted_traversals && gbwt_file_name.empty() && !gbz_graph) { - cerr << "Error [vg deconstruct]: -d can only be used with -e or -g or GBZ input" << endl; - return 1; - } - - if ((!gbwt_file_name.empty() || gbz_graph) && path_restricted_traversals && !gbz_graph) { - cerr << "Error [vg deconstruct]: -e cannot be used with -g or GBZ input" << endl; - return 1; - } - if (!gbwt_file_name.empty() || gbz_graph) { // context jaccard depends on having steps for each alt traversal, which is // not something we have on hand when getting traversals from the GBWT/GBZ @@ -352,7 +337,7 @@ int main_deconstruct(int argc, char** argv){ } dd.set_translation(translation.get()); dd.set_nested(all_snarls); - dd.deconstruct(refpaths, graph, snarl_manager.get(), path_restricted_traversals, ploidy, + dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, untangle_traversals, diff --git a/src/subcommand/snarls_main.cpp b/src/subcommand/snarls_main.cpp index 8587e93d2d2..20772eb328b 100644 --- a/src/subcommand/snarls_main.cpp +++ b/src/subcommand/snarls_main.cpp @@ -299,7 +299,7 @@ int main_snarl(int argc, char** argv) { if (path_traversals) { // Limit traversals to embedded paths - trav_finder.reset(new PathTraversalFinder(*graph, snarl_manager)); + trav_finder.reset(new PathTraversalFinder(*graph)); } else if (vcf_filename.empty()) { // This finder works with any backing graph trav_finder.reset(new ExhaustiveTraversalFinder(*graph, snarl_manager)); diff --git a/src/traversal_finder.cpp b/src/traversal_finder.cpp index ac5b20e7681..4a2b7498f9b 100644 --- a/src/traversal_finder.cpp +++ b/src/traversal_finder.cpp @@ -10,6 +10,36 @@ namespace vg { using namespace std; +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps) { + string s; + function handle_to_string = [&](handle_t handle) { + string ss = graph->get_is_reverse(handle) ? "<" : ">"; + return ss + std::to_string(graph->get_id(handle)); + }; + if (traversal.size() <= max_steps) { + for (const handle_t& handle : traversal) { + s += handle_to_string(handle); + } + } else { + for (int64_t i = 0; i < max_steps / 2; ++i) { + s += handle_to_string(traversal[i]); + } + s += "..."; + for (int64_t i = 0; i < max_steps / 2; ++i) { + s += handle_to_string(traversal[traversal.size() - 1 - i]); + } + } + return s; +} + +string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle) { + function handle_to_string = [&](handle_t handle) { + string ss = graph->get_is_reverse(handle) ? "<" : ">"; + return ss + std::to_string(graph->get_id(handle)); + }; + return handle_to_string(start_handle) + handle_to_string(end_handle); +} + PathBasedTraversalFinder::PathBasedTraversalFinder(const PathHandleGraph& g, SnarlManager& sm) : graph(g), snarlmanager(sm){ } @@ -815,9 +845,9 @@ vector ReadRestrictedTraversalFinder::find_traversals(const Snar return to_return; } -PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, +PathTraversalFinder::PathTraversalFinder(const PathHandleGraph& graph, const vector& path_names) : - graph(graph), snarl_manager(snarl_manager) { + graph(graph) { for (const string& path_name : path_names) { assert(graph.has_path(path_name)); paths.insert(graph.get_path_handle(path_name)); @@ -828,15 +858,36 @@ vector PathTraversalFinder::find_traversals(const Snarl& site) { return find_path_traversals(site).first; } -pair, vector > > PathTraversalFinder::find_path_traversals(const Snarl& site) { +vector PathTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + return find_path_traversals(snarl_start, snarl_end).first; +} - handle_t start_handle = graph.get_handle(site.start().node_id(), site.start().backward()); - handle_t end_handle = graph.get_handle(site.end().node_id(), site.end().backward()); - - vector start_steps = graph.steps_of_handle(start_handle); - vector end_steps = graph.steps_of_handle(end_handle); +pair, vector > > find_path_traversals(const Snarl& site); + +pair, vector> PathTraversalFinder::find_path_traversals(const Snarl& site) { - pair, unordered_set > snarl_contents = snarl_manager.deep_contents(&site, graph, true); + pair, vector> path_travs = this->find_path_traversals( + graph.get_handle(site.start().node_id(), site.start().backward()), + graph.get_handle(site.end().node_id(), site.end().backward())); + + vector travs(path_travs.first.size()); + + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + SnarlTraversal& proto_trav = travs[i]; + const Traversal& handle_trav = path_travs.first[i]; + for (int64_t j = 0; j < handle_trav.size(); ++j) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph.get_id(handle_trav[j])); + visit->set_backward(graph.get_is_reverse(handle_trav[j])); + } + } + return make_pair(travs, path_travs.second); +} + +pair, vector> PathTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + + vector start_steps = graph.steps_of_handle(snarl_start); + vector end_steps = graph.steps_of_handle(snarl_end); // use this to skip paths that don't reach the end node unordered_set end_path_handles; @@ -845,40 +896,48 @@ pair, vector > > PathT } #ifdef debug - cerr << "Finding traversals of " << pb2json(site) << " using PathTraversalFinder" << endl + cerr << "Finding traversals of " << (graph->get_is_reverse(snarl_start) ? "<" : ">") << graph->get_id(snarl_start) + << (graph->get_is_reverse(snarl_end) ? "<" : ">") << graph->get_id(snarl_edn) << " using PathTraversalFinder" << endl << " - there are " << start_steps.size() << " start_steps, " << end_steps.size() << " end_steps" << " and " << end_path_handles.size() << " end_path_handles" << endl; #endif - vector out_travs; + vector out_travs; vector > out_steps; + // collect edges that lead out the snarl to make sure we don't search in the wrong direction + unordered_set block_edges; + graph.follow_edges(snarl_start, true, [&](handle_t prev) { + block_edges.insert(graph.edge_handle(prev, snarl_start)); + }); + graph.follow_edges(snarl_end, false, [&](handle_t next) { + block_edges.insert(graph.edge_handle(snarl_end, next)); + }); + for (const step_handle_t& start_step : start_steps) { path_handle_t start_path_handle = graph.get_path_handle_of_step(start_step); // only crawl paths that have a chance of reaching the end if ((paths.empty() || paths.count(start_path_handle)) && end_path_handles.count(start_path_handle)) { - handle_t end_check = end_handle; + handle_t end_check = snarl_end; #ifdef debug cerr << " - considering path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking forward - SnarlTraversal trav; + Traversal trav; bool can_continue = true; step_handle_t step = start_step; while (can_continue) { handle_t handle = graph.get_handle_of_step(step); - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_next_step(step) && handle != end_handle) { + if (graph.has_next_step(step) && handle != snarl_end) { step_handle_t next_step = graph.get_next_step(step); handle_t next_handle = graph.get_handle_of_step(next_step); - if (snarl_contents.first.count(graph.get_id(next_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, next_handle))) { + // todo: we only need this check once + if (!block_edges.count(graph.edge_handle(handle, next_handle))) { step = next_step; can_continue = true; } @@ -890,25 +949,21 @@ pair, vector > > PathT cerr << " - failed to find forward traversal of path " << graph.get_path_name(start_path_handle) << endl; #endif // try to make a traversal by walking backward - end_check = graph.flip(end_handle); + end_check = graph.flip(snarl_end); - trav.Clear(); + trav.clear(); can_continue = true; step = start_step; while (can_continue) { handle_t handle = graph.flip(graph.get_handle_of_step(step)); - - Visit* start_visit = trav.add_visit(); - start_visit->set_node_id(graph.get_id(handle)); - start_visit->set_backward(graph.get_is_reverse(handle)); + trav.push_back(handle); can_continue = false; - if (graph.has_previous_step(step) && handle != end_handle) { + if (graph.has_previous_step(step) && handle != snarl_end) { step_handle_t prev_step = graph.get_previous_step(step); handle_t prev_handle = graph.flip(graph.get_handle_of_step(prev_step)); - - if (snarl_contents.first.count(graph.get_id(prev_handle)) && - snarl_contents.second.count(graph.edge_handle(handle, prev_handle))) { + // todo: we only need this check once + if (!block_edges.count(graph.edge_handle(prev_handle, handle))) { step = prev_step; can_continue = true; } @@ -2556,7 +2611,7 @@ VCFTraversalFinder::VCFTraversalFinder(const PathHandleGraph& graph, SnarlManage snarl_manager(snarl_manager), skip_alt(skip_alt), max_traversal_cutoff(max_traversal_cutoff), - path_finder(graph, snarl_manager, ref_path_names) { + path_finder(graph, ref_path_names) { create_variant_index(vcf, ref_fasta, ins_fasta); } @@ -3380,15 +3435,38 @@ GBWTTraversalFinder::~GBWTTraversalFinder() { pair, vector>> GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) { + + pair, vector>> gbwt_travs = this->find_gbwt_traversals( + graph.get_handle(site.start().node_id(), site.start().backward()), + graph.get_handle(site.end().node_id(), site.end().backward()), + return_paths); + + vector travs(gbwt_travs.first.size()); + + for (int64_t i = 0; i < gbwt_travs.first.size(); ++i) { + SnarlTraversal& proto_trav = travs[i]; + const Traversal& handle_trav = gbwt_travs.first[i]; + for (int64_t j = 0; j < handle_trav.size(); ++j) { + Visit* visit = proto_trav.add_visit(); + visit->set_node_id(graph.get_id(handle_trav[j])); + visit->set_backward(graph.get_is_reverse(handle_trav[j])); + } + } + + return make_pair(travs, gbwt_travs.second); +} + +pair, vector>> +GBWTTraversalFinder::find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths) { // follow all gbwt threads from start to end vector, gbwt::SearchState> > forward_traversals = list_haplotypes( graph, gbwt, - graph.get_handle(site.start().node_id(), site.start().backward()), + snarl_start, [&] (const vector& new_thread) { - return gbwt::Node::id(new_thread.back()) == site.end().node_id() && - gbwt::Node::is_reverse(new_thread.back()) == site.end().backward(); + return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_end) && + gbwt::Node::is_reverse(new_thread.back()) == graph.get_is_reverse(snarl_end); }); // follow all gbwt threads from end to start @@ -3397,15 +3475,15 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) backward_traversals = list_haplotypes( graph, gbwt, - graph.get_handle(site.end().node_id(), !site.end().backward()), + graph.flip(snarl_end), [&] (const vector& new_thread) { - return gbwt::Node::id(new_thread.back()) == site.start().node_id() && - gbwt::Node::is_reverse(new_thread.back()) == !site.start().backward(); + return gbwt::Node::id(new_thread.back()) == graph.get_id(snarl_start) && + gbwt::Node::is_reverse(new_thread.back()) == !graph.get_is_reverse(snarl_start); }); } // store them all as snarltraversals - vector traversals; + vector traversals; vector> gbwt_paths; traversals.reserve(forward_traversals.size() + backward_traversals.size()); @@ -3413,8 +3491,7 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) for (int i = 0; i < forward_traversals.size(); ++i) { traversals.emplace_back(); for (auto j = forward_traversals[i].first.begin(); j != forward_traversals[i].first.end(); ++j) { - Visit* visit = traversals.back().add_visit(); - *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j))); } if (return_paths) { gbwt_paths.push_back(gbwt.locate(forward_traversals[i].second)); @@ -3458,8 +3535,7 @@ GBWTTraversalFinder::find_gbwt_traversals(const Snarl& site, bool return_paths) // insert if not duplicate of existing forward traversal traversals.emplace_back(); for (auto j = backward_traversals[i].first.begin(); j != backward_traversals[i].first.end(); ++j) { - Visit* visit = traversals.back().add_visit(); - *visit = to_visit(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j)); + traversals.back().push_back(graph.get_handle(gbwt::Node::id(*j), gbwt::Node::is_reverse(*j))); } if (return_paths) { gbwt_paths.push_back(gbwt.locate(backward_traversals[i].second)); @@ -3474,6 +3550,10 @@ vector GBWTTraversalFinder::find_traversals(const Snarl& site) { return find_gbwt_traversals(site, false).first; } +vector GBWTTraversalFinder::find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + return find_gbwt_traversals(snarl_start, snarl_end, false).first; +} + pair, vector> GBWTTraversalFinder::find_path_traversals(const Snarl& site) { // get the unique traversals pair, vector>> gbwt_traversals = find_gbwt_traversals(site, true); @@ -3492,6 +3572,26 @@ pair, vector> GBWTTraversalFinder::find_ return path_traversals; } +pair, vector> GBWTTraversalFinder::find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + // get the unique traversals + pair, vector>> gbwt_traversals = this->find_gbwt_traversals(snarl_start, snarl_end, true); + + // expand them out to one per path (this is to be consistent with PathTraversalFinder as used in deconstruct) + pair, vector> path_traversals; + for (size_t i = 0; i < gbwt_traversals.first.size(); ++i) { + Traversal& trav = gbwt_traversals.first[i]; + vector& paths = gbwt_traversals.second[i]; + for (size_t j = 0; j < paths.size(); ++j) { + path_traversals.first.push_back(trav); + path_traversals.second.push_back(paths[j]); + } + } + + return path_traversals; +} + + + } diff --git a/src/traversal_finder.hpp b/src/traversal_finder.hpp index 47ad31e80ae..349d6c27f00 100644 --- a/src/traversal_finder.hpp +++ b/src/traversal_finder.hpp @@ -36,6 +36,13 @@ using namespace std; class AugmentedGraph; +// some Protobuf replacements +using Traversal = vector; +using PathInterval = pair; +string traversal_to_string(const PathHandleGraph* graph, const Traversal& traversal, bool max_steps = 10); +// replaces pb2json(snarl) +string graph_interval_to_string(const HandleGraph* graph, const handle_t& start_handle, const handle_t& end_handle); + /** * Represents a strategy for finding traversals of (nested) sites. Polymorphic * base class/interface. @@ -44,7 +51,12 @@ class TraversalFinder { public: virtual ~TraversalFinder() = default; - virtual vector find_traversals(const Snarl& site) = 0; + virtual vector find_traversals(const Snarl& site) = 0; + + // new, protobuf-free interface. hope is to eventually deprecate the old one. for now it is only supported in a few places + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end) { + assert(false); return{}; + } }; class ExhaustiveTraversalFinder : public TraversalFinder { @@ -201,25 +213,25 @@ class PathTraversalFinder : public TraversalFinder { // our graph with indexed path positions const PathHandleGraph& graph; - SnarlManager& snarl_manager; - // restrict to these paths unordered_set paths; public: // if path_names not empty, only those paths will be considered - PathTraversalFinder(const PathHandleGraph& graph, SnarlManager& snarl_manager, + PathTraversalFinder(const PathHandleGraph& graph, const vector& path_names = {}); /** * Return all traversals through the site that are sub-paths of embedded paths in the graph */ virtual vector find_traversals(const Snarl& site); + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end); /** * Like above, but return the path steps for the for the traversal endpoints */ - virtual pair, vector > > find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); }; @@ -636,18 +648,22 @@ class GBWTTraversalFinder : public TraversalFinder { /* Return a traversal for every gbwt thread through the snarl */ virtual vector find_traversals(const Snarl& site); + virtual vector find_traversals(const handle_t& snarl_start, const handle_t& snarl_end); /** Return the traversals, paired with their path identifiers in the gbwt. The traversals are * unique, but there can be more than one path along each one (hence the vector) */ virtual pair, vector>> find_gbwt_traversals(const Snarl& site, bool return_paths = true); + virtual pair, vector>> + find_gbwt_traversals(const handle_t& snarl_start, const handle_t& snarl_end, bool return_paths = true); /** Return traversals paired with path identifiers from the GBWT. The traversals are *not* unique * (which is consistent with PathTraversalFinder) * To get the sample name from the path identifier id, use gbwtgraph::get_path_sample_name(); */ virtual pair, vector> find_path_traversals(const Snarl& site); + virtual pair, vector> find_path_traversals(const handle_t& snarl_start, const handle_t& snarl_end); const gbwt::GBWT& get_gbwt() { return gbwt; } diff --git a/src/unittest/snarls.cpp b/src/unittest/snarls.cpp index 77d06a9dd73..670de4ad76b 100644 --- a/src/unittest/snarls.cpp +++ b/src/unittest/snarls.cpp @@ -3933,7 +3933,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(2); @@ -3972,7 +3972,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(7); @@ -4013,7 +4013,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(8); @@ -4053,7 +4053,7 @@ namespace vg { CactusSnarlFinder snarl_finder(graph); SnarlManager snarl_manager = snarl_finder.find_snarls(); - PathTraversalFinder trav_finder(graph, snarl_manager); + PathTraversalFinder trav_finder(graph); Snarl snarl; snarl.mutable_start()->set_node_id(11); diff --git a/src/variant_recall.cpp b/src/variant_recall.cpp index 340f86c394a..e85821a3d14 100644 --- a/src/variant_recall.cpp +++ b/src/variant_recall.cpp @@ -82,7 +82,7 @@ void genotype_svs(VG* graph, Deconstructor decon; CactusSnarlFinder finder(xg_index); SnarlManager snarl_manager = finder.find_snarls(); - decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 1, false, 10000, false, false, false, false); + decon.deconstruct({refpath}, &xg_index, &snarl_manager, false, 10000, false, false, false, false); } direct_ins.clear(); diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index aa11d17d092..1b2cafb7460 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,22 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 24 - -vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg -vg index tiny.vg -x tiny.xg -vg deconstruct tiny.xg -p x -t 1 > tiny_decon.vcf -# we pop out that GC allele because it gets invented by the adjacent snps in the graph -gzip -dc tiny/tiny.vcf.gz | tail -3 | awk '{print $1 "\t" $2 "\t" $4 "\t" $5}' > tiny_orig.tsv -cat tiny_decon.vcf | grep -v "#" | grep -v GC | awk '{print $1 "\t" $2 "\t" $4 "\t" $5}' > tiny_dec.tsv -diff tiny_orig.tsv tiny_dec.tsv -is "$?" 0 "deconstruct retrieved original VCF (modulo adjacent snp allele)" -grep '>1>6' tiny_decon.vcf | awk '{print $8}' > allele_travs -printf "AT=>1>3>5>6,>1>3>4>6,>1>2>5>6,>1>2>4>6\n" > expected_allele_travs -diff allele_travs expected_allele_travs -is "$?" 0 "deconstruct produced expected AT field" - -rm -f tiny.vg tiny.xg tiny_decon.vcf tiny_orig.tsv tiny_dec.tsv allele_travs expected_allele_travs +plan tests 22 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 192d6d19839..54b52f09911 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -340,12 +340,14 @@ then # Dependencies for running tests. Need numpy, scipy and sklearn for # running toil-vg mapeval, and dateutils and reqests for - # ./mins_since_last_build.py. Make sure to get the giant buildable modules - # as binaries only to avoid wasting CI time building some slightly nicer - # version Pip prefers. - pip3 install pytest timeout_decorator requests dateutils + # ./mins_since_last_build.py. Need exactly the right version of requests + # for the Python Docker API to work (see + # ). + pip3 install pytest timeout_decorator 'requests==2.31.0' dateutils # TODO: To upgrade to Numpy 1.24+, we need to remove usages of `numpy.int` # AKA `np.int` from toil-vg. See + # Make sure to get the giant buildable modules as binaries only to avoid + # wasting CI time building some slightly nicer version Pip prefers. pip3 install 'numpy==1.23.5' scipy scikit-learn --only-binary :all: # Install Toil