From f491fdb033b1f412b741eca98119c6bd80756709 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 27 May 2024 11:51:05 -0400 Subject: [PATCH] move to weighted jac; add small test --- src/deconstructor.cpp | 17 +++++++++++------ src/subcommand/deconstruct_main.cpp | 19 ++++++++++++++----- src/traversal_clusters.cpp | 27 ++++++++++++++++++++++++++- src/traversal_clusters.hpp | 4 +++- test/t/26_deconstruct.t | 17 ++++++++++++++++- 5 files changed, 70 insertions(+), 14 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index a68486d9790..9352e1e8364 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -350,11 +350,16 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes)) ? std::to_string(trav_to_allele[chosen_travs[i]]) : "."; if (this->cluster_threshold < 1.0) { - ostringstream ss; - ss.precision(3); - ss << trav_to_cluster_info[chosen_travs[i]].first; - v.samples[sample_name]["TS"].push_back(ss.str()); - v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + if (*genotype.rbegin() == '.') { + v.samples[sample_name]["TS"].push_back("."); + v.samples[sample_name]["TL"].push_back("."); + } else { + ostringstream ss; + ss.precision(3); + ss << trav_to_cluster_info[chosen_travs[i]].first; + v.samples[sample_name]["TS"].push_back(ss.str()); + v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second)); + } } } v.samples[sample_name]["GT"] = {genotype}; @@ -920,7 +925,7 @@ string Deconstructor::get_vcf_header() { if (sample_to_haps.empty()) { cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " - << "now only works on embedded paths and GBWT threads" << endl; + << "now only works on embedded paths and GBWT threads." << endl; exit(1); } diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index 0eac567df23..e0cb36dac31 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -260,13 +260,22 @@ int main_deconstruct(int argc, char** argv){ } if (refpaths.empty() && refpath_prefixes.empty()) { + bool found_hap; // No paths specified: use them all graph->for_each_path_handle([&](path_handle_t path_handle) { - const string& name = graph->get_path_name(path_handle); - if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { - refpaths.push_back(name); - } - }); + const string& name = graph->get_path_name(path_handle); + if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) { + refpaths.push_back(name); + } else { + found_hap = true; + } + }); + + if (!found_hap) { + cerr << "error [vg deconstruct]: All graph paths selected as references (leaving no alts). Please use -P/-p " + << "to narrow down the reference to a subset of paths, or GBZ/GBWT input that contains haplotype paths" << endl; + return 1; + } } // Read the translation diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index fefe78277a3..4ff358cf603 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -5,6 +5,31 @@ namespace vg { +// specialized version of jaccard_coefficient() that weights jaccard by node lenths. +double weighted_jaccard_coefficient(const PathHandleGraph* graph, + const multiset& target, + const multiset& query) { + vector intersection_handles; + std::set_intersection(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(intersection_handles)); + vector union_handles; + std::set_union(target.begin(), target.end(), + query.begin(), query.end(), + std::back_inserter(union_handles)); + + int64_t isec_size = 0; + for (const handle_t& handle : intersection_handles) { + isec_size += graph->get_length(handle); + } + int64_t union_size = 0; + for (const handle_t& handle : union_handles) { + union_size += graph->get_length(handle); + } + return (double)isec_size / (double)union_size; +} + + vector get_traversal_order(const PathHandleGraph* graph, const vector& traversals, const vector& trav_path_names, @@ -89,7 +114,7 @@ vector> cluster_traversals(const PathHandleGraph* graph, int64_t max_cluster_idx = -1; for (int64_t j = 0; j < clusters.size(); ++j) { const auto& cluster_trav = sorted_traversals[clusters[j][0]]; - double jac = jaccard_coefficient(trav, cluster_trav); + double jac = weighted_jaccard_coefficient(graph, trav, cluster_trav); if (jac > max_jaccard) { max_jaccard = jac; max_cluster_idx = j; diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 15b35aa57f6..31c857f553b 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -39,9 +39,11 @@ inline double jaccard_coefficient(const T& target, const U& query) { query.begin(), query.end(), count_back_inserter(union_size)); return (double)isec_size / (double)union_size; - } +// specialized version that weights jaccard by node lenths. +double weighted_jaccard_coefficient(const PathHandleGraph* graph, const multiset& target, const multiset& query); + // the information needed from the parent traversal in order to // genotype a child traversal struct ParentGenotypeInfo { diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 1b2cafb7460..bde71f7483e 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 22 +plan tests 25 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -141,4 +141,19 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +# todo you could argue merging shouldn't happen here because there's no child snarl +# this check should come into play with nesting support +vg construct -r small/x.fa -v small/x.vcf.gz | vg view - > small_cluster.gfa +printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster.gfa +printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster.gfa +printf "P\tz\t1+,9+\t*\n" >> small_cluster.gfa +vg deconstruct small_cluster.gfa -p x > small_cluster_0.vcf +vg deconstruct small_cluster.gfa -p x -L 0.3 > small_cluster_3.vcf +is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free deconstruction finds all alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles" +is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info" + +rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf + +