From 5f1dc81be1411abd6ca6e9d2d66f33b9a0eddeda Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 27 Nov 2024 11:14:32 +0100 Subject: [PATCH 01/10] Update libbdsg and add unused option to only store distances along the top-level chain --- deps/libbdsg | 2 +- src/snarl_distance_index.cpp | 13 +++++++------ src/snarl_distance_index.hpp | 6 +++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 5d12710fcd..def1c35a74 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 5d12710fcdff1090c16479210a1d25d244d6b129 +Subproject commit def1c35a748506fe6668115a41f5d7b994bfbbf0 diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 5fc4dcd022..215437a38f 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -20,11 +20,12 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po get_id(pos2), get_is_rev(pos2), get_offset(pos2)); } -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) { +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances, bool silence_warnings) { distance_index->set_snarl_size_limit(size_limit); + distance_index->set_only_top_level_chain_distances(only_top_level_chain_distances); //Build the temporary distance index from the graph - SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit); + SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit, only_top_level_chain_distances); if (!silence_warnings && temp_index.use_oversized_snarls) { cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl; @@ -37,7 +38,7 @@ void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGrap distance_index->get_snarl_tree_records(indexes, graph); } SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( - const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { + const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances) { #ifdef debug_distance_indexing cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl; @@ -484,7 +485,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_index.temp_snarl_records.at(chain_child_index.second); //Fill in this snarl's distances - populate_snarl_index(temp_index, chain_child_index, size_limit, graph); + populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph); bool new_component = temp_snarl_record.min_length == std::numeric_limits::max(); if (new_component){ @@ -733,7 +734,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (pair& component_index : temp_index.components) { if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second); - populate_snarl_index(temp_index, component_index, size_limit, graph); + populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph); temp_snarl_record.min_length = std::numeric_limits::max(); } } @@ -756,7 +757,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( void populate_snarl_index( SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, pair snarl_index, size_t size_limit, - const HandleGraph* graph) { + bool only_top_level_chain_distances, const HandleGraph* graph) { #ifdef debug_distance_indexing cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 13e8777301..33764d5eb8 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -22,13 +22,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po //Fill in the index //size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true); +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool only_top_level_chain_distances = false, bool silence_warnings=true); //Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, const HandleGraph* graph) ; + pair snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; -SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit); +SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances); //Define wang_hash for net_handle_t's so that we can use a hash_map template<> struct wang_hash { From 64d7e82e0f40a42792a3b89f3ac9d40d98ae8742 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 27 Nov 2024 07:01:08 -0800 Subject: [PATCH 02/10] Add option to not store nested distances --- deps/libbdsg | 2 +- src/subcommand/index_main.cpp | 49 +++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index def1c35a74..507aa3af14 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit def1c35a748506fe6668115a41f5d7b994bfbbf0 +Subproject commit 507aa3af14185673ef80fb9d9fab343d4db86fdf diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 731fdcff26..3e744a7cc6 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -38,28 +38,29 @@ void help_index(char** argv) { << "Creates an index on the specified graph or graphs. All graphs indexed must " << endl << "already be in a joint ID space." << endl << "general options:" << endl - << " -b, --temp-dir DIR use DIR for temporary files" << endl - << " -t, --threads N number of threads to use" << endl - << " -p, --progress show progress" << endl + << " -b, --temp-dir DIR use DIR for temporary files" << endl + << " -t, --threads N number of threads to use" << endl + << " -p, --progress show progress" << endl << "xg options:" << endl - << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl - << " -L, --xg-alts include alt paths in xg" << endl + << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl + << " -L, --xg-alts include alt paths in xg" << endl << "gcsa options:" << endl - << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl - //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl - << " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl - << " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl - << " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl - << " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl - << " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl + << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl + //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl + << " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl + << " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl + << " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl + << " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl + << " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl << "gam indexing options:" << endl - << " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl + << " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl << "vg in-place indexing options:" << endl - << " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl + << " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl << "snarl distance index options" << endl - << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl - << " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl - << " if N is 0 then don't store distances, only the snarl tree" << endl; + << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl + << " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl + << " if N is 0 then don't store distances, only the snarl tree" << endl + << " --no-nested-distance only store distances along the top-level chain" << endl; } int main_index(int argc, char** argv) { @@ -72,6 +73,7 @@ int main_index(int argc, char** argv) { #define OPT_BUILD_VGI_INDEX 1000 #define OPT_RENAME_VARIANTS 1001 #define OPT_DISTANCE_SNARL_LIMIT 1002 + #define OPT_DISTANCE_NESTING 1003 // Which indexes to build. bool build_xg = false, build_gcsa = false, build_dist = false; @@ -103,6 +105,8 @@ int main_index(int argc, char** argv) { //Distance index size_t snarl_limit = 50000; + bool only_top_level_chain_distances = false; + int c; optind = 2; // force optind past command positional argument while (true) { @@ -155,6 +159,7 @@ int main_index(int argc, char** argv) { //Snarl distance index {"snarl-limit", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, {"dist-name", required_argument, 0, 'j'}, + {"no-nested-distance", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, {0, 0, 0, 0} }; @@ -254,6 +259,10 @@ int main_index(int argc, char** argv) { snarl_limit = parse(optarg); break; + case OPT_DISTANCE_NESTING: + only_top_level_chain_distances = true; + break; + case 'h': case '?': help_index(argv); @@ -531,7 +540,7 @@ int main_index(int argc, char** argv) { SnarlDistanceIndex distance_index; //Fill it in - fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false); + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else { @@ -547,7 +556,7 @@ int main_index(int argc, char** argv) { //Make a distance index and fill it in SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else if (get<1>(options)) { @@ -559,7 +568,7 @@ int main_index(int argc, char** argv) { //Make a distance index and fill it in SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else { From 13c08f2aae7ac4a18d56f2fe0657a0838ebb2a13 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 27 Nov 2024 18:52:05 +0100 Subject: [PATCH 03/10] add unit test for top-level only distances --- src/unittest/snarl_distance_index.cpp | 39 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 9fbbe322cd..d3583d51ee 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -150,7 +150,7 @@ namespace vg { REQUIRE(std::get<2>(traceback.second.back()) == -5); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) { VG graph; @@ -188,6 +188,7 @@ namespace vg { Edge* e17 = graph.create_edge(n11, n12); Edge* e18 = graph.create_edge(n12, n13); + graph.serialize_to_file("test_graph.vg"); //get the snarls IntegratedSnarlFinder snarl_finder(graph); SECTION("Traversal of chain") { @@ -276,6 +277,40 @@ namespace vg { } } } + SECTION("Top-level distances only index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 500, true); + + //Nodes on the top-level chain have distances + for (auto& id : {n1->id(), n2->id(), n4->id(), n5->id(), n12->id(), n13->id()}) { + net_handle_t n = distance_index.get_node_net_handle(id); + net_handle_t parent = distance_index.get_parent(n); + assert(distance_index.is_root(distance_index.get_parent(parent))); + distance_index.minimum_length(n); + distance_index.minimum_length(parent); + REQUIRE(true); + } + + //Nested nodes don't have distances + for (auto& id : {n3->id(), n6->id(), n7->id(), n8->id(), n9->id(), n10->id(), n11->id()}) { + net_handle_t n = distance_index.get_node_net_handle(id); + net_handle_t parent = distance_index.get_parent(n); + assert(!distance_index.is_root(distance_index.get_parent(parent))); + try { + distance_index.minimum_length(n); + distance_index.minimum_length(parent); + REQUIRE(false); + } catch (const std::runtime_error& err) { + REQUIRE(true); + } + } + + SECTION("Find minimum distances along the top-level chain") { + REQUIRE(distance_index.minimum_distance(n2->id(),true, 0, n2->id(), false, 0) == 1); + REQUIRE(distance_index.minimum_distance(n4->id(),true, 0, n5->id(), true, 0) == 19); + REQUIRE(distance_index.minimum_distance(n5->id(),false, 0, n5->id(), true, 0) == 9); + } + } } TEST_CASE( "Snarl decomposition can deal with multiple connected components", "[snarl_distance]" ) { @@ -7143,7 +7178,7 @@ namespace vg { default_random_engine generator(test_seed_source()); - for (size_t repeat = 0; repeat < 0; repeat++) { + for (size_t repeat = 0; repeat < 100; repeat++) { uniform_int_distribution bases_dist(100, 1000); size_t bases = bases_dist(generator); From 77a9826c2ad5756313ca12c27f25c5bc4c109acb Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 27 Nov 2024 11:54:23 -0800 Subject: [PATCH 04/10] TUrn off random unit tests --- deps/libbdsg | 2 +- src/unittest/snarl_distance_index.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 507aa3af14..f626d12b23 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 507aa3af14185673ef80fb9d9fab343d4db86fdf +Subproject commit f626d12b23dfa0db9d8c01c61119af3e4170fb33 diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index d3583d51ee..ee0b854b74 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -7178,7 +7178,7 @@ namespace vg { default_random_engine generator(test_seed_source()); - for (size_t repeat = 0; repeat < 100; repeat++) { + for (size_t repeat = 0; repeat < 0; repeat++) { uniform_int_distribution bases_dist(100, 1000); size_t bases = bases_dist(generator); From 33bc76ed033e796c546aa8eaa7243226e69eb491 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 27 Nov 2024 13:15:50 -0800 Subject: [PATCH 05/10] Fix index subcommand --- src/subcommand/index_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 3e744a7cc6..2bc331395a 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -159,7 +159,7 @@ int main_index(int argc, char** argv) { //Snarl distance index {"snarl-limit", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, {"dist-name", required_argument, 0, 'j'}, - {"no-nested-distance", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, + {"no-nested-distance", no_argument, 0, OPT_DISTANCE_NESTING}, {0, 0, 0, 0} }; From 5a915ddbe8973d2c3b7c1f4997763d69001a344c Mon Sep 17 00:00:00 2001 From: Xian Date: Sun, 1 Dec 2024 10:55:31 +0100 Subject: [PATCH 06/10] Don't reserve space for distances in a snarl if we arent going to save them --- src/snarl_distance_index.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 215437a38f..5f0437c664 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -1064,10 +1064,13 @@ void populate_snarl_index( */ - //Reserve enough space to store all possible distances - temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0) - ? temp_snarl_record.node_count * 2 - : temp_snarl_record.node_count * temp_snarl_record.node_count); + if (size_limit != 0 && !only_top_level_chain_distances) { + //If we are saving distances + //Reserve enough space to store all possible distances + temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit + ? temp_snarl_record.node_count * 2 + : temp_snarl_record.node_count * temp_snarl_record.node_count); + } if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { temp_index.use_oversized_snarls = true; @@ -1138,7 +1141,7 @@ void populate_snarl_index( // assert(start_rank != 0 && start_rank != 1); //} - if ( (temp_snarl_record.node_count > size_limit || size_limit == 0) && (temp_snarl_record.is_root_snarl || (!start_is_tip && + if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || (!start_is_tip && start_rank != 0 && start_rank != 1))) { //If we don't care about internal distances, and we also are not at a boundary or tip //TODO: Why do we care about tips specifically? From c8007a2a23ad9d1da01049aa93851d1ba1559d97 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Tue, 3 Dec 2024 01:00:58 -0800 Subject: [PATCH 07/10] Update libbdsg to estimate the size of distanceless indexes better --- deps/libbdsg | 2 +- src/snarl_distance_index.cpp | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index f626d12b23..982848f76a 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit f626d12b23dfa0db9d8c01c61119af3e4170fb33 +Subproject commit 982848f76a8b628223c4d67fd633b564b66d56ad diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 5f0437c664..5ef5e1ab22 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -211,9 +211,12 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_chain_record.end_node_id = node_id; temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle); temp_chain_record.end_node_length = graph->get_length(chain_end_handle); + + bool is_root_chain = false; if (stack.empty()) { //If this was the last thing on the stack, then this was a root + is_root_chain = true; //Check to see if there is anything connected to the ends of the chain vector reachable_nodes; @@ -280,7 +283,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( parent_snarl_record.children.emplace_back(chain_index); } - temp_index.max_index_size += temp_chain_record.get_max_record_length(); + temp_index.max_index_size += temp_chain_record.get_max_record_length(!only_top_level_chain_distances || is_root_chain ? true : false ); #ifdef debug_distance_indexing cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl; @@ -1070,6 +1073,8 @@ void populate_snarl_index( temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit ? temp_snarl_record.node_count * 2 : temp_snarl_record.node_count * temp_snarl_record.node_count); + } else { + temp_snarl_record.include_distances = false; } if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { From 0916459d22ffa7d0fcfe9b0a4a7b0c499932bfa2 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 6 Dec 2024 13:22:07 +0100 Subject: [PATCH 08/10] Update libbdsg to get the python bindings to work --- deps/libbdsg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/libbdsg b/deps/libbdsg index 982848f76a..6cff67667e 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 982848f76a8b628223c4d67fd633b564b66d56ad +Subproject commit 6cff67667eefaf1a573cd3b64777d2d78ad2a572 From fc1a6a34892ab8fabcb2becb93421381d9d5e4de Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 6 Dec 2024 15:38:00 +0100 Subject: [PATCH 09/10] Use one vector for all chain distances when building the distance index --- deps/libbdsg | 2 +- src/snarl_distance_index.cpp | 147 +++++++++++++++++++---------------- 2 files changed, 81 insertions(+), 68 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 6cff67667e..84a80f4b48 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 6cff67667eefaf1a573cd3b64777d2d78ad2a572 +Subproject commit 84a80f4b4867421c19301df44d76d0b6fc5c61aa diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 5ef5e1ab22..eba1a48113 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -460,10 +460,13 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #endif //Add the first values for the prefix sum and backwards loop vectors - temp_chain_record.prefix_sum.emplace_back(0); - temp_chain_record.max_prefix_sum.emplace_back(0); - temp_chain_record.backward_loops.emplace_back(std::numeric_limits::max()); - temp_chain_record.chain_components.emplace_back(0); + //This adds the forward loop, which will be set later + temp_chain_record.per_node_distances.emplace_back(); + temp_chain_record.per_node_distances.back().prefix_sum=0; + temp_chain_record.per_node_distances.back().max_prefix_sum=0; + temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); + temp_chain_record.per_node_distances.back().backward_loop=std::numeric_limits::max(); + temp_chain_record.per_node_distances.back().chain_component=0; /*First, go through each of the snarls in the chain in the forward direction and @@ -477,7 +480,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //Go through each of the children in the chain, skipping nodes //The snarl may be trivial, in which case don't fill in the distances #ifdef debug_distance_indexing - cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) << " current max prefi xum " << temp_chain_record.max_prefix_sum.back() << endl; + cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) << " current max prefi xum " << temp_chain_record.per_node_distances.back().max_prefix_sum << endl; #endif if (chain_child_index.first == SnarlDistanceIndex::TEMP_SNARL){ @@ -501,29 +504,39 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //tracking the distance vectors here //Update the maximum distance - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().max_prefix_sum); + + temp_chain_record.per_node_distances.emplace_back(); + temp_chain_record.per_node_distances.back().prefix_sum=0; + temp_chain_record.per_node_distances.back().max_prefix_sum=0; + temp_chain_record.per_node_distances.back().backward_loop=temp_snarl_record.distance_end_end; + temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); + - temp_chain_record.prefix_sum.emplace_back(0); - temp_chain_record.max_prefix_sum.emplace_back(0); - temp_chain_record.backward_loops.emplace_back(temp_snarl_record.distance_end_end); //If the chain is disconnected, the max length is infinite temp_chain_record.max_length = std::numeric_limits::max(); } else { - temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.prefix_sum.back(), + const auto& prev_node_distances = temp_chain_record.per_node_distances.back(); + temp_chain_record.per_node_distances.emplace_back(); + temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); + temp_chain_record.per_node_distances.back().prefix_sum = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + prev_node_distances.prefix_sum, temp_snarl_record.min_length), - temp_snarl_record.start_node_length)); - temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.max_prefix_sum.back(), + temp_snarl_record.start_node_length); + + temp_chain_record.per_node_distances.back().max_prefix_sum = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + prev_node_distances.max_prefix_sum, temp_snarl_record.max_length), - temp_snarl_record.start_node_length)); - temp_chain_record.backward_loops.emplace_back(std::min(temp_snarl_record.distance_end_end, - SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back() - , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length)))); + temp_snarl_record.start_node_length); + + temp_chain_record.per_node_distances.back().backward_loop = std::min(temp_snarl_record.distance_end_end, + SnarlDistanceIndex::sum(prev_node_distances.backward_loop + , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length))); + temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, temp_snarl_record.max_length); } - temp_chain_record.chain_components.emplace_back(curr_component); + temp_chain_record.per_node_distances.back().chain_component = curr_component; if (chain_child_i == temp_chain_record.children.size() - 2 && temp_snarl_record.min_length == std::numeric_limits::max()) { temp_chain_record.loopable = false; } @@ -546,16 +559,20 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } }); - temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back(), last_node_length)); - temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.max_prefix_sum.back(), last_node_length)); - temp_chain_record.backward_loops.emplace_back(std::min(backward_loop, - SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back(), 2 * last_node_length))); + const auto& prev_node_distances = temp_chain_record.per_node_distances.back(); + temp_chain_record.per_node_distances.emplace_back(); + + temp_chain_record.per_node_distances.back().prefix_sum=SnarlDistanceIndex::sum(prev_node_distances.prefix_sum, last_node_length); + temp_chain_record.per_node_distances.back().max_prefix_sum=SnarlDistanceIndex::sum(prev_node_distances.max_prefix_sum, last_node_length); + temp_chain_record.per_node_distances.back().backward_loop= std::min(backward_loop, + SnarlDistanceIndex::sum(prev_node_distances.backward_loop, 2 * last_node_length)); + temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); if (chain_child_i == temp_chain_record.children.size()-1) { //If this is the last node temp_chain_record.loopable=false; } - temp_chain_record.chain_components.emplace_back(curr_component); + temp_chain_record.per_node_distances.back().chain_component=curr_component; } last_node_length = temp_index.temp_node_records.at(chain_child_index.second - temp_index.min_node_id).node_length; //And update the chains max length @@ -563,57 +580,51 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( last_node_length); } } //Finished walking through chain - if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.chain_components.back() != 0) { + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.per_node_distances.back().chain_component != 0) { //If this is a looping, multicomponent chain, the start/end node could end up in separate chain components //despite being the same node. //Since the first component will always be 0, set the first node's component to be whatever the last //component was - temp_chain_record.chain_components[0] = temp_chain_record.chain_components.back(); + temp_chain_record.per_node_distances[0].chain_component = temp_chain_record.per_node_distances.back().chain_component; } //For a multicomponent chain, the actual minimum length will always be infinite, but since we sometimes need //the length of the last component, save that here temp_chain_record.min_length = !temp_chain_record.is_trivial && temp_chain_record.start_node_id == temp_chain_record.end_node_id - ? temp_chain_record.prefix_sum.back() - : SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back() , temp_chain_record.end_node_length); + ? temp_chain_record.per_node_distances.back().prefix_sum + : SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.back().prefix_sum , temp_chain_record.end_node_length); -#ifdef debug_distance_indexing - assert(temp_chain_record.prefix_sum.size() == temp_chain_record.backward_loops.size()); - assert(temp_chain_record.prefix_sum.size() == temp_chain_record.chain_components.size()); -#endif /*Now that we've gone through all the snarls in the chain, fill in the forward loop vector * by going through the chain in the backwards direction */ - temp_chain_record.forward_loops.resize(temp_chain_record.prefix_sum.size(), - std::numeric_limits::max()); if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { //If this is a looping chain, then check the first snarl for a loop if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.children.at(1).second); - temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; + temp_chain_record.per_node_distances[temp_chain_record.per_node_distances.size()-1].forward_loop = temp_snarl_record.distance_start_start; } } - size_t node_i = temp_chain_record.prefix_sum.size() - 2; + size_t node_i = temp_chain_record.per_node_distances.size() - 2; // We start at the next to last node because we need to look at this record and the next one. last_node_length = 0; for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); - if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && - temp_chain_record.chain_components.at(node_i+1) != 0){ + if (temp_chain_record.per_node_distances.at(node_i).chain_component != temp_chain_record.per_node_distances.at(node_i+1).chain_component && + temp_chain_record.per_node_distances.at(node_i+1).chain_component != 0){ //If this is a new chain component, then add the loop distance from the snarl //If the component of the next node is 0, then we're still in the same component since we're going backwards - temp_chain_record.forward_loops.at(node_i) = temp_snarl_record.distance_start_start; + temp_chain_record.per_node_distances.at(node_i).forward_loop = temp_snarl_record.distance_start_start; } else { - temp_chain_record.forward_loops.at(node_i) = + temp_chain_record.per_node_distances.at(node_i).forward_loop = std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.forward_loops.at(node_i+1), + temp_chain_record.per_node_distances.at(node_i+1).forward_loop, 2* temp_snarl_record.min_length), 2*temp_snarl_record.end_node_length), temp_snarl_record.distance_start_start); @@ -635,8 +646,8 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( forward_loop = 0; } }); - temp_chain_record.forward_loops.at(node_i) = std::min( forward_loop, - SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , + temp_chain_record.per_node_distances.at(node_i).forward_loop = std::min( forward_loop, + SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i+1).forward_loop, 2*last_node_length)); node_i--; } @@ -652,8 +663,8 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //Also check if the reverse loop values would be improved if we went around again - if (temp_chain_record.backward_loops.back() < temp_chain_record.backward_loops.front()) { - temp_chain_record.backward_loops[0] = temp_chain_record.backward_loops.back(); + if (temp_chain_record.per_node_distances.back().backward_loop < temp_chain_record.per_node_distances.front().backward_loop) { + temp_chain_record.per_node_distances[0].backward_loop = temp_chain_record.per_node_distances.back().backward_loop; size_t node_i = 1; size_t last_node_length = 0; for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { @@ -661,61 +672,61 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( if (child.first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.backward_loops.at(node_i-1), + temp_chain_record.per_node_distances.at(node_i-1).backward_loop, 2*temp_snarl_record.min_length), 2*temp_snarl_record.start_node_length); - if (temp_chain_record.chain_components.at(node_i)!= 0 || new_loop_distance >= temp_chain_record.backward_loops.at(node_i)) { + if (temp_chain_record.per_node_distances.at(node_i).chain_component!= 0 || new_loop_distance >= temp_chain_record.per_node_distances.at(node_i).backward_loop) { //If this is a new chain component or it doesn't improve, stop break; } else { //otherwise record the better distance - temp_chain_record.backward_loops.at(node_i) = new_loop_distance; + temp_chain_record.per_node_distances.at(node_i).backward_loop = new_loop_distance; } node_i++; last_node_length = 0; } else { if (last_node_length != 0) { - size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.backward_loops.at(node_i-1), + size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i-1).backward_loop, 2*last_node_length); - size_t old_loop_distance = temp_chain_record.backward_loops.at(node_i); - temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); + size_t old_loop_distance = temp_chain_record.per_node_distances.at(node_i).backward_loop; + temp_chain_record.per_node_distances.at(node_i).backward_loop = std::min(old_loop_distance,new_loop_distance); node_i++; } last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; } } } - if (temp_chain_record.forward_loops.front() < temp_chain_record.forward_loops.back()) { + if (temp_chain_record.per_node_distances.front().forward_loop < temp_chain_record.per_node_distances.back().forward_loop) { //If this is a looping chain and looping improves the forward loops, //then we have to keep going around to update distance - temp_chain_record.forward_loops.back() = temp_chain_record.forward_loops.front(); + temp_chain_record.per_node_distances.back().forward_loop = temp_chain_record.per_node_distances.front().forward_loop; size_t last_node_length = 0; - node_i = temp_chain_record.prefix_sum.size() - 2; + node_i = temp_chain_record.per_node_distances.size() - 2; for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.forward_loops.at(node_i+1), + temp_chain_record.per_node_distances.at(node_i+1).forward_loop, 2* temp_snarl_record.min_length), 2*temp_snarl_record.end_node_length); - if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) || - new_distance >= temp_chain_record.forward_loops.at(node_i)){ + if (temp_chain_record.per_node_distances.at(node_i).chain_component != temp_chain_record.per_node_distances.at(node_i+1).chain_component || + new_distance >= temp_chain_record.per_node_distances.at(node_i).forward_loop){ //If this is a new component or the distance doesn't improve, stop looking break; } else { //otherwise, update the distance - temp_chain_record.forward_loops.at(node_i) = new_distance; + temp_chain_record.per_node_distances.at(node_i).forward_loop = new_distance; } node_i --; last_node_length =0; } else { if (last_node_length != 0) { - size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2* last_node_length); - size_t old_distance = temp_chain_record.forward_loops.at(node_i); - temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); + size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i+1).forward_loop , 2* last_node_length); + size_t old_distance = temp_chain_record.per_node_distances.at(node_i).forward_loop; + temp_chain_record.per_node_distances.at(node_i).forward_loop = std::min(old_distance, new_distance); node_i--; } last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; @@ -724,9 +735,11 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } } - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); - temp_index.max_distance = temp_chain_record.forward_loops.back() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.forward_loops.back()); - temp_index.max_distance = temp_chain_record.backward_loops.front() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.backward_loops.front()); + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().max_prefix_sum); + temp_index.max_distance = temp_chain_record.per_node_distances.back().forward_loop == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().forward_loop); + temp_index.max_distance = temp_chain_record.per_node_distances.front().backward_loop == std::numeric_limits::max() + ? temp_index.max_distance + : std::max(temp_index.max_distance, temp_chain_record.per_node_distances.front().backward_loop); assert(temp_index.max_distance <= 2742664019); } @@ -1376,7 +1389,7 @@ void populate_snarl_index( size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_length(next_handle) : temp_index.temp_chain_records[next_index.second].min_length; if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && - temp_index.temp_chain_records[next_index.second].chain_components.back() != 0) { + temp_index.temp_chain_records[next_index.second].per_node_distances.back().chain_component != 0) { //If there are multiple components, then the chain is not start-end reachable so its length //is actually infinite next_node_length = std::numeric_limits::max(); @@ -1387,8 +1400,8 @@ void populate_snarl_index( } } if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { - size_t loop_distance = next_rev ? temp_index.temp_chain_records[next_index.second].backward_loops.back() - : temp_index.temp_chain_records[next_index.second].forward_loops.front(); + size_t loop_distance = next_rev ? temp_index.temp_chain_records[next_index.second].per_node_distances.back().backward_loop + : temp_index.temp_chain_records[next_index.second].per_node_distances.front().forward_loop; if (loop_distance != std::numeric_limits::max() && visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && graph->get_id(next_handle) != temp_snarl_record.start_node_id && From ae92b91e7fe9abe49bd0982048dd3a5125a99032 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 6 Dec 2024 15:55:13 +0100 Subject: [PATCH 10/10] Undo last commit because it has the same problem that I literally just had with python bindings --- deps/libbdsg | 2 +- src/snarl_distance_index.cpp | 147 ++++++++++++++++------------------- 2 files changed, 68 insertions(+), 81 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 84a80f4b48..6cff67667e 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 84a80f4b4867421c19301df44d76d0b6fc5c61aa +Subproject commit 6cff67667eefaf1a573cd3b64777d2d78ad2a572 diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index eba1a48113..5ef5e1ab22 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -460,13 +460,10 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( #endif //Add the first values for the prefix sum and backwards loop vectors - //This adds the forward loop, which will be set later - temp_chain_record.per_node_distances.emplace_back(); - temp_chain_record.per_node_distances.back().prefix_sum=0; - temp_chain_record.per_node_distances.back().max_prefix_sum=0; - temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); - temp_chain_record.per_node_distances.back().backward_loop=std::numeric_limits::max(); - temp_chain_record.per_node_distances.back().chain_component=0; + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(std::numeric_limits::max()); + temp_chain_record.chain_components.emplace_back(0); /*First, go through each of the snarls in the chain in the forward direction and @@ -480,7 +477,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //Go through each of the children in the chain, skipping nodes //The snarl may be trivial, in which case don't fill in the distances #ifdef debug_distance_indexing - cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) << " current max prefi xum " << temp_chain_record.per_node_distances.back().max_prefix_sum << endl; + cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) << " current max prefi xum " << temp_chain_record.max_prefix_sum.back() << endl; #endif if (chain_child_index.first == SnarlDistanceIndex::TEMP_SNARL){ @@ -504,39 +501,29 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //tracking the distance vectors here //Update the maximum distance - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().max_prefix_sum); - - temp_chain_record.per_node_distances.emplace_back(); - temp_chain_record.per_node_distances.back().prefix_sum=0; - temp_chain_record.per_node_distances.back().max_prefix_sum=0; - temp_chain_record.per_node_distances.back().backward_loop=temp_snarl_record.distance_end_end; - temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); - + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(temp_snarl_record.distance_end_end); //If the chain is disconnected, the max length is infinite temp_chain_record.max_length = std::numeric_limits::max(); } else { - const auto& prev_node_distances = temp_chain_record.per_node_distances.back(); - temp_chain_record.per_node_distances.emplace_back(); - temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); - temp_chain_record.per_node_distances.back().prefix_sum = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - prev_node_distances.prefix_sum, + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.prefix_sum.back(), temp_snarl_record.min_length), - temp_snarl_record.start_node_length); - - temp_chain_record.per_node_distances.back().max_prefix_sum = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - prev_node_distances.max_prefix_sum, + temp_snarl_record.start_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.max_prefix_sum.back(), temp_snarl_record.max_length), - temp_snarl_record.start_node_length); - - temp_chain_record.per_node_distances.back().backward_loop = std::min(temp_snarl_record.distance_end_end, - SnarlDistanceIndex::sum(prev_node_distances.backward_loop - , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length))); - + temp_snarl_record.start_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(temp_snarl_record.distance_end_end, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back() + , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length)))); temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, temp_snarl_record.max_length); } - temp_chain_record.per_node_distances.back().chain_component = curr_component; + temp_chain_record.chain_components.emplace_back(curr_component); if (chain_child_i == temp_chain_record.children.size() - 2 && temp_snarl_record.min_length == std::numeric_limits::max()) { temp_chain_record.loopable = false; } @@ -559,20 +546,16 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } }); - const auto& prev_node_distances = temp_chain_record.per_node_distances.back(); - temp_chain_record.per_node_distances.emplace_back(); - - temp_chain_record.per_node_distances.back().prefix_sum=SnarlDistanceIndex::sum(prev_node_distances.prefix_sum, last_node_length); - temp_chain_record.per_node_distances.back().max_prefix_sum=SnarlDistanceIndex::sum(prev_node_distances.max_prefix_sum, last_node_length); - temp_chain_record.per_node_distances.back().backward_loop= std::min(backward_loop, - SnarlDistanceIndex::sum(prev_node_distances.backward_loop, 2 * last_node_length)); - temp_chain_record.per_node_distances.back().forward_loop=std::numeric_limits::max(); + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back(), last_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.max_prefix_sum.back(), last_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(backward_loop, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back(), 2 * last_node_length))); if (chain_child_i == temp_chain_record.children.size()-1) { //If this is the last node temp_chain_record.loopable=false; } - temp_chain_record.per_node_distances.back().chain_component=curr_component; + temp_chain_record.chain_components.emplace_back(curr_component); } last_node_length = temp_index.temp_node_records.at(chain_child_index.second - temp_index.min_node_id).node_length; //And update the chains max length @@ -580,51 +563,57 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( last_node_length); } } //Finished walking through chain - if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.per_node_distances.back().chain_component != 0) { + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.chain_components.back() != 0) { //If this is a looping, multicomponent chain, the start/end node could end up in separate chain components //despite being the same node. //Since the first component will always be 0, set the first node's component to be whatever the last //component was - temp_chain_record.per_node_distances[0].chain_component = temp_chain_record.per_node_distances.back().chain_component; + temp_chain_record.chain_components[0] = temp_chain_record.chain_components.back(); } //For a multicomponent chain, the actual minimum length will always be infinite, but since we sometimes need //the length of the last component, save that here temp_chain_record.min_length = !temp_chain_record.is_trivial && temp_chain_record.start_node_id == temp_chain_record.end_node_id - ? temp_chain_record.per_node_distances.back().prefix_sum - : SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.back().prefix_sum , temp_chain_record.end_node_length); + ? temp_chain_record.prefix_sum.back() + : SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back() , temp_chain_record.end_node_length); +#ifdef debug_distance_indexing + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.backward_loops.size()); + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.chain_components.size()); +#endif /*Now that we've gone through all the snarls in the chain, fill in the forward loop vector * by going through the chain in the backwards direction */ + temp_chain_record.forward_loops.resize(temp_chain_record.prefix_sum.size(), + std::numeric_limits::max()); if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { //If this is a looping chain, then check the first snarl for a loop if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.children.at(1).second); - temp_chain_record.per_node_distances[temp_chain_record.per_node_distances.size()-1].forward_loop = temp_snarl_record.distance_start_start; + temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; } } - size_t node_i = temp_chain_record.per_node_distances.size() - 2; + size_t node_i = temp_chain_record.prefix_sum.size() - 2; // We start at the next to last node because we need to look at this record and the next one. last_node_length = 0; for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); - if (temp_chain_record.per_node_distances.at(node_i).chain_component != temp_chain_record.per_node_distances.at(node_i+1).chain_component && - temp_chain_record.per_node_distances.at(node_i+1).chain_component != 0){ + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && + temp_chain_record.chain_components.at(node_i+1) != 0){ //If this is a new chain component, then add the loop distance from the snarl //If the component of the next node is 0, then we're still in the same component since we're going backwards - temp_chain_record.per_node_distances.at(node_i).forward_loop = temp_snarl_record.distance_start_start; + temp_chain_record.forward_loops.at(node_i) = temp_snarl_record.distance_start_start; } else { - temp_chain_record.per_node_distances.at(node_i).forward_loop = + temp_chain_record.forward_loops.at(node_i) = std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.per_node_distances.at(node_i+1).forward_loop, + temp_chain_record.forward_loops.at(node_i+1), 2* temp_snarl_record.min_length), 2*temp_snarl_record.end_node_length), temp_snarl_record.distance_start_start); @@ -646,8 +635,8 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( forward_loop = 0; } }); - temp_chain_record.per_node_distances.at(node_i).forward_loop = std::min( forward_loop, - SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i+1).forward_loop, + temp_chain_record.forward_loops.at(node_i) = std::min( forward_loop, + SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2*last_node_length)); node_i--; } @@ -663,8 +652,8 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( //Also check if the reverse loop values would be improved if we went around again - if (temp_chain_record.per_node_distances.back().backward_loop < temp_chain_record.per_node_distances.front().backward_loop) { - temp_chain_record.per_node_distances[0].backward_loop = temp_chain_record.per_node_distances.back().backward_loop; + if (temp_chain_record.backward_loops.back() < temp_chain_record.backward_loops.front()) { + temp_chain_record.backward_loops[0] = temp_chain_record.backward_loops.back(); size_t node_i = 1; size_t last_node_length = 0; for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { @@ -672,61 +661,61 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( if (child.first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.per_node_distances.at(node_i-1).backward_loop, + temp_chain_record.backward_loops.at(node_i-1), 2*temp_snarl_record.min_length), 2*temp_snarl_record.start_node_length); - if (temp_chain_record.per_node_distances.at(node_i).chain_component!= 0 || new_loop_distance >= temp_chain_record.per_node_distances.at(node_i).backward_loop) { + if (temp_chain_record.chain_components.at(node_i)!= 0 || new_loop_distance >= temp_chain_record.backward_loops.at(node_i)) { //If this is a new chain component or it doesn't improve, stop break; } else { //otherwise record the better distance - temp_chain_record.per_node_distances.at(node_i).backward_loop = new_loop_distance; + temp_chain_record.backward_loops.at(node_i) = new_loop_distance; } node_i++; last_node_length = 0; } else { if (last_node_length != 0) { - size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i-1).backward_loop, + size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.backward_loops.at(node_i-1), 2*last_node_length); - size_t old_loop_distance = temp_chain_record.per_node_distances.at(node_i).backward_loop; - temp_chain_record.per_node_distances.at(node_i).backward_loop = std::min(old_loop_distance,new_loop_distance); + size_t old_loop_distance = temp_chain_record.backward_loops.at(node_i); + temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); node_i++; } last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; } } } - if (temp_chain_record.per_node_distances.front().forward_loop < temp_chain_record.per_node_distances.back().forward_loop) { + if (temp_chain_record.forward_loops.front() < temp_chain_record.forward_loops.back()) { //If this is a looping chain and looping improves the forward loops, //then we have to keep going around to update distance - temp_chain_record.per_node_distances.back().forward_loop = temp_chain_record.per_node_distances.front().forward_loop; + temp_chain_record.forward_loops.back() = temp_chain_record.forward_loops.front(); size_t last_node_length = 0; - node_i = temp_chain_record.per_node_distances.size() - 2; + node_i = temp_chain_record.prefix_sum.size() - 2; for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { auto& child = temp_chain_record.children.at(j); if (child.first == SnarlDistanceIndex::TEMP_SNARL){ SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.per_node_distances.at(node_i+1).forward_loop, + temp_chain_record.forward_loops.at(node_i+1), 2* temp_snarl_record.min_length), 2*temp_snarl_record.end_node_length); - if (temp_chain_record.per_node_distances.at(node_i).chain_component != temp_chain_record.per_node_distances.at(node_i+1).chain_component || - new_distance >= temp_chain_record.per_node_distances.at(node_i).forward_loop){ + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) || + new_distance >= temp_chain_record.forward_loops.at(node_i)){ //If this is a new component or the distance doesn't improve, stop looking break; } else { //otherwise, update the distance - temp_chain_record.per_node_distances.at(node_i).forward_loop = new_distance; + temp_chain_record.forward_loops.at(node_i) = new_distance; } node_i --; last_node_length =0; } else { if (last_node_length != 0) { - size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.per_node_distances.at(node_i+1).forward_loop , 2* last_node_length); - size_t old_distance = temp_chain_record.per_node_distances.at(node_i).forward_loop; - temp_chain_record.per_node_distances.at(node_i).forward_loop = std::min(old_distance, new_distance); + size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2* last_node_length); + size_t old_distance = temp_chain_record.forward_loops.at(node_i); + temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); node_i--; } last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; @@ -735,11 +724,9 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( } } - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().max_prefix_sum); - temp_index.max_distance = temp_chain_record.per_node_distances.back().forward_loop == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.per_node_distances.back().forward_loop); - temp_index.max_distance = temp_chain_record.per_node_distances.front().backward_loop == std::numeric_limits::max() - ? temp_index.max_distance - : std::max(temp_index.max_distance, temp_chain_record.per_node_distances.front().backward_loop); + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + temp_index.max_distance = temp_chain_record.forward_loops.back() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.forward_loops.back()); + temp_index.max_distance = temp_chain_record.backward_loops.front() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.backward_loops.front()); assert(temp_index.max_distance <= 2742664019); } @@ -1389,7 +1376,7 @@ void populate_snarl_index( size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_length(next_handle) : temp_index.temp_chain_records[next_index.second].min_length; if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && - temp_index.temp_chain_records[next_index.second].per_node_distances.back().chain_component != 0) { + temp_index.temp_chain_records[next_index.second].chain_components.back() != 0) { //If there are multiple components, then the chain is not start-end reachable so its length //is actually infinite next_node_length = std::numeric_limits::max(); @@ -1400,8 +1387,8 @@ void populate_snarl_index( } } if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { - size_t loop_distance = next_rev ? temp_index.temp_chain_records[next_index.second].per_node_distances.back().backward_loop - : temp_index.temp_chain_records[next_index.second].per_node_distances.front().forward_loop; + size_t loop_distance = next_rev ? temp_index.temp_chain_records[next_index.second].backward_loops.back() + : temp_index.temp_chain_records[next_index.second].forward_loops.front(); if (loop_distance != std::numeric_limits::max() && visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && graph->get_id(next_handle) != temp_snarl_record.start_node_id &&