Skip to content

Commit

Permalink
move to weighted jac; add small test
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed May 27, 2024
1 parent 9d01127 commit f491fdb
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 14 deletions.
17 changes: 11 additions & 6 deletions src/deconstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,16 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
genotype += (chosen_travs[i] != -1 && (!conflict || keep_conflicted_genotypes))
? std::to_string(trav_to_allele[chosen_travs[i]]) : ".";
if (this->cluster_threshold < 1.0) {
ostringstream ss;
ss.precision(3);
ss << trav_to_cluster_info[chosen_travs[i]].first;
v.samples[sample_name]["TS"].push_back(ss.str());
v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second));
if (*genotype.rbegin() == '.') {
v.samples[sample_name]["TS"].push_back(".");
v.samples[sample_name]["TL"].push_back(".");
} else {
ostringstream ss;
ss.precision(3);
ss << trav_to_cluster_info[chosen_travs[i]].first;
v.samples[sample_name]["TS"].push_back(ss.str());
v.samples[sample_name]["TL"].push_back(std::to_string(trav_to_cluster_info[chosen_travs[i]].second));
}
}
}
v.samples[sample_name]["GT"] = {genotype};
Expand Down Expand Up @@ -920,7 +925,7 @@ string Deconstructor::get_vcf_header() {
if (sample_to_haps.empty()) {
cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that "
<< "exhaustive path-free traversal finding is no longer supported, and vg deconstruct "
<< "now only works on embedded paths and GBWT threads" << endl;
<< "now only works on embedded paths and GBWT threads." << endl;
exit(1);
}

Expand Down
19 changes: 14 additions & 5 deletions src/subcommand/deconstruct_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,13 +260,22 @@ int main_deconstruct(int argc, char** argv){
}

if (refpaths.empty() && refpath_prefixes.empty()) {
bool found_hap;
// No paths specified: use them all
graph->for_each_path_handle([&](path_handle_t path_handle) {
const string& name = graph->get_path_name(path_handle);
if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) {
refpaths.push_back(name);
}
});
const string& name = graph->get_path_name(path_handle);
if (!Paths::is_alt(name) && PathMetadata::parse_sense(name) != PathSense::HAPLOTYPE) {
refpaths.push_back(name);
} else {
found_hap = true;
}
});

if (!found_hap) {
cerr << "error [vg deconstruct]: All graph paths selected as references (leaving no alts). Please use -P/-p "
<< "to narrow down the reference to a subset of paths, or GBZ/GBWT input that contains haplotype paths" << endl;
return 1;
}
}

// Read the translation
Expand Down
27 changes: 26 additions & 1 deletion src/traversal_clusters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,31 @@
namespace vg {


// specialized version of jaccard_coefficient() that weights jaccard by node lenths.
double weighted_jaccard_coefficient(const PathHandleGraph* graph,
const multiset<handle_t>& target,
const multiset<handle_t>& query) {
vector<handle_t> intersection_handles;
std::set_intersection(target.begin(), target.end(),
query.begin(), query.end(),
std::back_inserter(intersection_handles));
vector<handle_t> union_handles;
std::set_union(target.begin(), target.end(),
query.begin(), query.end(),
std::back_inserter(union_handles));

int64_t isec_size = 0;
for (const handle_t& handle : intersection_handles) {
isec_size += graph->get_length(handle);
}
int64_t union_size = 0;
for (const handle_t& handle : union_handles) {
union_size += graph->get_length(handle);
}
return (double)isec_size / (double)union_size;
}


vector<int> get_traversal_order(const PathHandleGraph* graph,
const vector<Traversal>& traversals,
const vector<string>& trav_path_names,
Expand Down Expand Up @@ -89,7 +114,7 @@ vector<vector<int>> cluster_traversals(const PathHandleGraph* graph,
int64_t max_cluster_idx = -1;
for (int64_t j = 0; j < clusters.size(); ++j) {
const auto& cluster_trav = sorted_traversals[clusters[j][0]];
double jac = jaccard_coefficient(trav, cluster_trav);
double jac = weighted_jaccard_coefficient(graph, trav, cluster_trav);
if (jac > max_jaccard) {
max_jaccard = jac;
max_cluster_idx = j;
Expand Down
4 changes: 3 additions & 1 deletion src/traversal_clusters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ inline double jaccard_coefficient(const T& target, const U& query) {
query.begin(), query.end(),
count_back_inserter<typename T::value_type>(union_size));
return (double)isec_size / (double)union_size;

}

// specialized version that weights jaccard by node lenths.
double weighted_jaccard_coefficient(const PathHandleGraph* graph, const multiset<handle_t>& target, const multiset<handle_t>& query);

// the information needed from the parent traversal in order to
// genotype a child traversal
struct ParentGenotypeInfo {
Expand Down
17 changes: 16 additions & 1 deletion test/t/26_deconstruct.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 22
plan tests 25

vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg
vg index hla.vg -x hla.xg
Expand Down Expand Up @@ -141,4 +141,19 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction"

rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa

# todo you could argue merging shouldn't happen here because there's no child snarl
# this check should come into play with nesting support
vg construct -r small/x.fa -v small/x.vcf.gz | vg view - > small_cluster.gfa
printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster.gfa
printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster.gfa
printf "P\tz\t1+,9+\t*\n" >> small_cluster.gfa
vg deconstruct small_cluster.gfa -p x > small_cluster_0.vcf
vg deconstruct small_cluster.gfa -p x -L 0.3 > small_cluster_3.vcf
is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free deconstruction finds all alt alleles"
is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles"
is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info"

rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf



2 comments on commit f491fdb

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch deconstruct. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17365 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch deconstruct. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17365 seconds

Please sign in to comment.