From 748bfdfe3e7e508a8b13d05aabcfacd385bcffea Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Sun, 9 Oct 2016 18:34:46 +0100 Subject: [PATCH 1/8] Replace usage of dead positional_coverage Replace with calculate_positional_coverage. --- src/utils/read_stats.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/read_stats.hpp b/src/utils/read_stats.hpp index dc65b4fdc..d79bebc9f 100644 --- a/src/utils/read_stats.hpp +++ b/src/utils/read_stats.hpp @@ -98,14 +98,14 @@ namespace detail double mean_coverage(const T& reads, const GenomicRegion& region, NonMapTag) { if (reads.empty() || is_empty(region)) return 0; - return maths::mean(positional_coverage(reads, region)); + return maths::mean(calculate_positional_coverage(reads, region)); } template double stdev_coverage(const T& reads, const GenomicRegion& region, NonMapTag) { if (reads.empty() || is_empty(region)) return 0; - return maths::stdev(positional_coverage(reads, region)); + return maths::stdev(calculate_positional_coverage(reads, region)); } template From cee9598dd69569671ed976be947fe33d391cbe55 Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Sun, 9 Oct 2016 21:11:37 +0100 Subject: [PATCH 2/8] Switch to log space in somatic prior calculations Doing these calculations in normal space can lead to arithmetic overflow and screw up downstream calculations. --- .../mutation/somatic_mutation_model.cpp | 43 ++++++++++++++----- .../mutation/somatic_mutation_model.hpp | 8 +--- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/core/models/mutation/somatic_mutation_model.cpp b/src/core/models/mutation/somatic_mutation_model.cpp index 49a00b873..63e476416 100644 --- a/src/core/models/mutation/somatic_mutation_model.cpp +++ b/src/core/models/mutation/somatic_mutation_model.cpp @@ -25,23 +25,46 @@ double SomaticMutationModel::evaluate(const CancerGenotype& genotype) const auto& germline = genotype.germline_genotype(); const auto& somatic = genotype.somatic_element(); const auto germline_log_prior = germline_model_.get().evaluate(germline); - const auto somatic_probability_given_germline = probability_of_somatic(somatic, germline); - return germline_log_prior + std::log(somatic_probability_given_germline); + return germline_log_prior + ln_probability_of_somatic(somatic, germline); } -// p(somatic | germline) = 1 / M sum k = 1 -> M p(somatic | germline_k) (M = germline ploidy) -double SomaticMutationModel::probability_of_somatic(const Haplotype& somatic, const Genotype& germline) const +// p(somatic | germline) = 1 / M sum [k = 1 -> M] p(somatic | germline_k) (M = germline ploidy) +double SomaticMutationModel::ln_probability_of_somatic(const Haplotype& somatic, const Genotype& germline) const { - return std::accumulate(std::cbegin(germline), std::cend(germline), 0.0, - [this, &somatic](const double curr, const Haplotype& germline) { - return curr + probability_of_somatic(somatic, germline); - }) / germline.ploidy(); + switch (germline.ploidy()) { + case 1: return ln_probability_of_somatic(somatic, germline); + case 2: + { + const static double ln2 {std::log(2)}; + const auto a = ln_probability_of_somatic(somatic, germline[0]); + const auto b = ln_probability_of_somatic(somatic, germline[1]); + return maths::log_sum_exp(a, b) - ln2; + } + case 3: + { + const static double ln3 {std::log(2)}; + const auto a = ln_probability_of_somatic(somatic, germline[0]); + const auto b = ln_probability_of_somatic(somatic, germline[1]); + const auto c = ln_probability_of_somatic(somatic, germline[3]); + return maths::log_sum_exp(a, b, c) - ln3; + } + default: + { + std::vector tmp(germline.ploidy()); + std::transform(std::cbegin(germline), std::cend(germline), std::begin(tmp), + [this, &somatic] (const Haplotype& haplotype) { + return ln_probability_of_somatic(somatic, haplotype); + }); + return maths::log_sum_exp(tmp) - std::log(germline.ploidy()); + } + } } -double SomaticMutationModel::probability_of_somatic(const Haplotype& somatic, const Haplotype& germline) const +double SomaticMutationModel::ln_probability_of_somatic(const Haplotype& somatic, const Haplotype& germline) const { + // TODO: implement a proper model for this (snv/indel). const auto variants = difference(somatic, germline); - return std::pow(params_.somatic_mutation_rate, variants.size()); + return variants.size() * params_.somatic_mutation_rate; } } // namespace octopus diff --git a/src/core/models/mutation/somatic_mutation_model.hpp b/src/core/models/mutation/somatic_mutation_model.hpp index 45427733f..a81b9c4f4 100644 --- a/src/core/models/mutation/somatic_mutation_model.hpp +++ b/src/core/models/mutation/somatic_mutation_model.hpp @@ -42,8 +42,8 @@ class SomaticMutationModel Parameters params_; // p(somatic | germline) - double probability_of_somatic(const Haplotype& somatic, const Genotype& germline) const; - double probability_of_somatic(const Haplotype& somatic, const Haplotype& germline) const; + double ln_probability_of_somatic(const Haplotype& somatic, const Genotype& germline) const; + double ln_probability_of_somatic(const Haplotype& somatic, const Haplotype& germline) const; }; template @@ -52,16 +52,12 @@ std::vector calculate_log_priors(const Container& genotypes, { static_assert(std::is_same>::value, "genotypes must contain CancerGenotype's"); - std::vector result(genotypes.size()); - std::transform(std::cbegin(genotypes), std::cend(genotypes), std::begin(result), [&model](const auto& genotype) { return model.evaluate(genotype); }); - maths::normalise_logs(result); - return result; } From 775cfe82c1491c459374e5ea6ed95def2dc0d996 Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Sun, 9 Oct 2016 21:12:48 +0100 Subject: [PATCH 3/8] Remove vertical whitspace --- src/core/models/genotype/tumour_model.cpp | 88 ++--------------------- 1 file changed, 4 insertions(+), 84 deletions(-) diff --git a/src/core/models/genotype/tumour_model.cpp b/src/core/models/genotype/tumour_model.cpp index 1f5bf1973..c77962b59 100644 --- a/src/core/models/genotype/tumour_model.cpp +++ b/src/core/models/genotype/tumour_model.cpp @@ -167,12 +167,10 @@ template CompressedAlphas flatten_priors(const TumourModel::Priors& priors, const std::vector& samples) { CompressedAlphas result(samples.size()); - std::transform(std::cbegin(samples), std::cend(samples), std::begin(result), [&priors] (const auto& sample) { return compress(priors.alphas.at(sample)); }); - return result; } @@ -182,20 +180,15 @@ compress(const CancerGenotype& genotype, const SampleName& sample, const HaplotypeLikelihoodCache& haplotype_likelihoods) { CompressedGenotype result; - const Genotype& germline_genotype {genotype.germline_genotype()}; - assert(germline_genotype.ploidy() == (K - 1)); - std::transform(std::cbegin(germline_genotype), std::cend(germline_genotype), std::begin(result), [&sample, &haplotype_likelihoods] (const Haplotype& haplotype) -> std::reference_wrapper { return std::cref(haplotype_likelihoods(sample, haplotype)); }); - result.back() = haplotype_likelihoods(sample, genotype.somatic_element()); - return result; } @@ -205,12 +198,10 @@ compress(const std::vector>& genotypes, const SampleNa const HaplotypeLikelihoodCache& haplotype_likelihoods) { CompressedGenotypes result(genotypes.size()); - std::transform(std::cbegin(genotypes), std::cend(genotypes), std::begin(result), [&sample, &haplotype_likelihoods] (const auto& genotype) { return compress(genotype, sample, haplotype_likelihoods); }); - return result; } @@ -222,12 +213,10 @@ compress(const std::vector>& genotypes, { CompressedReadLikelihoods result {}; result.reserve(samples.size()); - std::transform(std::cbegin(samples), std::cend(samples), std::back_inserter(result), [&genotypes, &haplotype_likelihoods] (const auto& sample) { return compress(genotypes, sample, haplotype_likelihoods); }); - return result; } @@ -273,39 +262,30 @@ init_responsabilities(const CompressedAlphas& prior_alphas, { assert(!read_likelihoods.empty()); assert(prior_alphas.size() == read_likelihoods.size()); - // notation follows documentation const auto S = read_likelihoods.size(); // num samples ResponsabilityVectors result {}; result.reserve(S); - for (std::size_t s {0}; s < S; ++s) { std::array al; // no need to keep recomputing this const auto a0 = sum(prior_alphas[s]); - for (unsigned k {0}; k < K; ++k) { al[k] = digamma_diff(prior_alphas[s][k], a0); } - const auto N = read_likelihoods[s][0][0].size(); // num reads in sample s ResponsabilityVector read_responsabilities(N); std::array ln_rho; - for (std::size_t n {0}; n < N; ++n) { for (unsigned k {0}; k < K; ++k) { ln_rho[k] = al[k] + expectation(genotype_pobabilities, read_likelihoods[s], k, n); } - const auto ln_rho_norm = maths::log_sum_exp(ln_rho); - for (unsigned k {0}; k < K; ++k) { read_responsabilities[n][k] = std::exp(ln_rho[k] - ln_rho_norm); } } - result.emplace_back(std::move(read_responsabilities)); } - return result; } @@ -317,25 +297,19 @@ void update_responsabilities(ResponsabilityVectors& result, const CompressedReadLikelihoods& read_likelihoods) { const auto S = read_likelihoods.size(); - for (std::size_t s {0}; s < S; ++s) { std::array al; const auto a0 = sum(posterior_alphas[s]); - for (unsigned k {0}; k < K; ++k) { al[k] = digamma_diff(posterior_alphas[s][k], a0); } - const auto N = read_likelihoods[s][0][0].size(); std::array ln_rho; - for (std::size_t n {0}; n < N; ++n) { for (unsigned k {0}; k < K; ++k) { ln_rho[k] = al[k] + expectation(genotype_pobabilities, read_likelihoods[s], k, n); } - const auto ln_rho_norm = maths::log_sum_exp(ln_rho); - for (unsigned k {0}; k < K; ++k) { result[s][n][k] = std::exp(ln_rho[k] - ln_rho_norm); } @@ -379,23 +353,18 @@ double marginalise(const ResponsabilityVectors& responsabilities, { double result {0}; const auto S = read_likelihoods.size(); // num samples - assert(S == responsabilities.size()); - for (std::size_t s {0}; s < S; ++s) { const auto N = read_likelihoods[s][0][0].size(); // num reads in sample s - assert(responsabilities[s].size() == N); assert(responsabilities[s][0].size() == K); assert(read_likelihoods[s][g].size() == K); - for (unsigned k {0}; k < K; ++k) { for (std::size_t n {0}; n < N; ++n) { result += responsabilities[s][n][k] * read_likelihoods[s][g][k][n]; } } } - return result; } @@ -409,7 +378,6 @@ void update_genotype_log_posteriors(LogProbabilityVector& result, for (std::size_t g {0}; g < G; ++g) { result[g] = genotype_log_priors[g] + marginalise(responsabilities, read_likelihoods, g); } - maths::normalise_logs(result); } @@ -427,12 +395,10 @@ template auto max_change(const CompressedAlpha& lhs, const CompressedAlpha& rhs) { double result {0}; - for (std::size_t k {0}; k < K; ++k) { const auto curr = std::abs(lhs[k] - rhs[k]); if (curr > result) result = curr; } - return result; } @@ -441,16 +407,12 @@ auto max_change(const CompressedAlphas& prior_alphas, const CompressedAlphas& posterior_alphas) { const auto S = prior_alphas.size(); - assert(S == posterior_alphas.size()); - double result {0}; - for (std::size_t s {0}; s < S; ++s) { const auto curr = max_change(prior_alphas[s], posterior_alphas[s]); if (curr > result) result = curr; } - return result; } @@ -500,14 +462,11 @@ template double expectation(const ResponsabilityVector& taus, const CompressedAlpha& alpha) { using boost::math::digamma; - const auto das = digamma(sum(alpha)); double result {0}; - for (unsigned k {0}; k < K; ++k) { result += (digamma(alpha[k]) - das) * sum(taus, k); } - return result; } @@ -528,7 +487,6 @@ double expectation(const ResponsabilityVectors& taus, const std::size_t g) { double result {0}; - for (std::size_t s {0}; s < taus.size(); ++s) { for (std::size_t n {0}; n < taus[s].size(); ++n) { for (unsigned k {0}; k < K; ++k) { @@ -536,7 +494,6 @@ double expectation(const ResponsabilityVectors& taus, } } } - return result; } @@ -547,11 +504,9 @@ double expectation(const ProbabilityVector& genotype_posteriors, const CompressedReadLikelihoods& log_likelihoods) { double result {0}; - for (std::size_t g {0}; g < genotype_posteriors.size(); ++g) { result += genotype_posteriors[g] * expectation(taus, log_likelihoods, g); } - return result; } @@ -620,9 +575,7 @@ double calculate_lower_bound(const CompressedAlphas& prior_alphas, const auto& genotype_log_posteriors = latents.genotype_log_posteriors; const auto& posterior_alphas = latents.alphas; const auto& taus = latents.responsabilities; - double result {0}; - result += expectation(genotype_posteriors, genotype_log_priors); result += expectation(prior_alphas, posterior_alphas); result += expectation(taus, posterior_alphas); @@ -630,7 +583,6 @@ double calculate_lower_bound(const CompressedAlphas& prior_alphas, result -= expectation(genotype_posteriors, genotype_log_posteriors); result -= expectation(posterior_alphas); result -= q_expectation(taus); - return result; } @@ -651,30 +603,22 @@ run_variational_bayes(const CompressedAlphas& prior_alphas, assert(prior_alphas.size() == log_likelihoods.size()); // num samples assert(log_likelihoods.front().size() == genotype_log_priors.size()); // num genotypes assert(params.max_iterations > 0); - auto genotype_posteriors = exp(genotype_log_posteriors); auto posterior_alphas = prior_alphas; - auto responsabilities = init_responsabilities(posterior_alphas, genotype_posteriors, - log_likelihoods); - + auto responsabilities = init_responsabilities(posterior_alphas, genotype_posteriors, log_likelihoods); assert(responsabilities.size() == log_likelihoods.size()); // num samples - bool is_converged {false}; double max_change {0}; - - // main loop for (unsigned i {0}; i < params.max_iterations; ++i) { update_genotype_log_posteriors(genotype_log_posteriors, genotype_log_priors, responsabilities, log_likelihoods); exp(genotype_log_posteriors, genotype_posteriors); update_alphas(posterior_alphas, prior_alphas, responsabilities); - update_responsabilities(responsabilities, posterior_alphas, genotype_posteriors, - log_likelihoods); + update_responsabilities(responsabilities, posterior_alphas, genotype_posteriors, log_likelihoods); std::tie(is_converged, max_change) = check_convergence(prior_alphas, posterior_alphas, max_change, params.epsilon); if (is_converged) break; } - return CompressedLatents { std::move(genotype_posteriors), std::move(genotype_log_posteriors), std::move(posterior_alphas), std::move(responsabilities) @@ -693,22 +637,16 @@ run_variational_bayes(const CompressedAlphas& prior_alphas, { std::vector> results; results.reserve(seeds.size()); - for (auto& seed : seeds) { - results.emplace_back(run_variational_bayes(prior_alphas, genotype_log_priors, - log_likelihoods, - std::move(seed), - params)); + results.push_back(run_variational_bayes(prior_alphas, genotype_log_priors, log_likelihoods, + std::move(seed), params)); } - std::vector result_evidences(results.size()); - std::transform(std::cbegin(results), std::cend(results), std::begin(result_evidences), [&] (const auto& latents) { return calculate_lower_bound(prior_alphas, genotype_log_priors, log_likelihoods, latents); }); - const auto it = std::max_element(std::cbegin(result_evidences), std::cend(result_evidences)); const auto idx = std::distance(std::cbegin(result_evidences), it); return std::make_pair(std::move(results[idx]), *it); @@ -721,7 +659,6 @@ expand(std::vector>&& genotypes, LogProbabilityVector&& genotype_log_posteriors) { TumourModel::Latents::GenotypeProbabilityMap result {}; - std::transform(std::make_move_iterator(std::begin(genotypes)), std::make_move_iterator(std::end(genotypes)), std::begin(genotype_log_posteriors), @@ -729,7 +666,6 @@ expand(std::vector>&& genotypes, [] (auto&& g, auto p) { return std::make_pair(std::move(g), p); }); - return result; } @@ -744,13 +680,11 @@ TumourModel::Latents::GenotypeMixturesDirichletAlphaMap expand(const std::vector& samples, CompressedAlphas&& alphas) { TumourModel::Latents::GenotypeMixturesDirichletAlphaMap result {}; - std::transform(std::cbegin(samples), std::cend(samples), std::begin(alphas), std::inserter(result, std::begin(result)), [] (const auto& sample, auto&& compressed_alpha) { return std::make_pair(sample, expand(compressed_alpha)); }); - return result; } @@ -763,7 +697,6 @@ expand(const std::vector& samples, std::vector result(genotypes.size()); - std::transform(std::cbegin(genotypes), std::cend(genotypes), std::begin(result), [&] (const auto& genotype) { return likelihood_model.evaluate(genotype.germline_genotype()); }); - maths::normalise_logs(result); - return result; } @@ -794,20 +722,15 @@ auto calculate_log_posteriors_with_germline_model(const SampleName& sample, const SomaticMutationModel& genotype_prior_model) { assert(!genotypes.empty()); - haplotype_log_likelihoods.prime(sample); - const GermlineLikelihoodModel likelihood_model {haplotype_log_likelihoods}; std::vector result(genotypes.size()); - std::transform(std::cbegin(genotypes), std::cend(genotypes), std::begin(result), [&] (const auto& genotype) { return genotype_prior_model.evaluate(genotype) + likelihood_model.evaluate(genotype.germline_genotype()); }); - maths::normalise_logs(result); - return result; } @@ -824,10 +747,8 @@ auto generate_seeds(const std::vector& samples, { std::vector result {}; result.reserve(2 + samples.size()); - result.emplace_back(genotype_log_priors); result.emplace_back(log_uniform_dist(genotypes.size())); - for (const auto& sample : samples) { result.emplace_back(calculate_log_posteriors_with_germline_model(sample, genotypes, haplotype_log_likelihoods)); @@ -835,7 +756,6 @@ auto generate_seeds(const std::vector& samples, haplotype_log_likelihoods, priors.genotype_prior_model)); } - return result; } From 1452932c48d3da61b931f28b05844fe1f20b1c2b Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Tue, 11 Oct 2016 19:58:20 +0100 Subject: [PATCH 4/8] Add PloidyMap class For storing sample ploidy information. --- src/CMakeLists.txt | 3 ++- src/basics/ploidy_map.cpp | 46 +++++++++++++++++++++++++++++++++++++++ src/basics/ploidy_map.hpp | 41 ++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 src/basics/ploidy_map.cpp create mode 100644 src/basics/ploidy_map.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cad8c1fc0..daec0cead 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,7 +44,8 @@ set(BASICS_SOURCES basics/aligned_read.hpp basics/aligned_read.cpp basics/mappable_reference_wrapper.hpp -) + basics/ploidy_map.hpp + basics/ploidy_map.cpp) set(CONTAINERS_SOURCES containers/mappable_flat_multi_set.hpp diff --git a/src/basics/ploidy_map.cpp b/src/basics/ploidy_map.cpp new file mode 100644 index 000000000..d9ab6fb2d --- /dev/null +++ b/src/basics/ploidy_map.cpp @@ -0,0 +1,46 @@ +// Copyright (c) 2016 Daniel Cooke +// Use of this source code is governed by the MIT license that can be found in the LICENSE file. + +#include "ploidy_map.hpp" + +#include + +namespace octopus { + +PloidyMap::PloidyMap(unsigned organism) +: organism_ {organism} +{} + +void PloidyMap::set(const ContigName& contig, unsigned ploidy) +{ + contigs_.emplace(contig, ploidy); +} + +void PloidyMap::set(const SampleName& sample, const ContigName& contig, unsigned ploidy) +{ + sample_contigs_[sample].emplace(contig, ploidy); +} + +unsigned PloidyMap::organism() const noexcept +{ + return organism_; +} + +unsigned PloidyMap::operator()(const SampleName& sample, const ContigName& contig) const noexcept +{ + const auto sample_itr = sample_contigs_.find(sample); + if (sample_itr != std::cend(sample_contigs_)) { + const auto contig_itr = sample_itr->second.find(contig); + if (contig_itr != std::cend(sample_itr->second)) { + return contig_itr->second; + } + } else { + const auto contig_itr = contigs_.find(contig); + if (contig_itr != std::cend(contigs_)) { + return contig_itr->second; + } + } + return organism_; +} + +} // namespace octopus diff --git a/src/basics/ploidy_map.hpp b/src/basics/ploidy_map.hpp new file mode 100644 index 000000000..2d10e80f8 --- /dev/null +++ b/src/basics/ploidy_map.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2016 Daniel Cooke +// Use of this source code is governed by the MIT license that can be found in the LICENSE file. + +#ifndef ploidy_map_hpp +#define ploidy_map_hpp + +#include + +#include "config/common.hpp" + +namespace octopus { + +class PloidyMap +{ +public: + PloidyMap(unsigned organism = 2); + + PloidyMap(const PloidyMap&) = default; + PloidyMap& operator=(const PloidyMap&) = default; + PloidyMap(PloidyMap&&) = default; + PloidyMap& operator=(PloidyMap&&) = default; + + ~PloidyMap() = default; + + void set(const SampleName& sample, const ContigName& contig, unsigned ploidy); + void set(const ContigName& contig, unsigned ploidy); + + unsigned organism() const noexcept; + unsigned operator()(const SampleName& sample, const ContigName& contig) const noexcept; + +private: + using ContigPloidyMap = std::unordered_map; + + unsigned organism_; + ContigPloidyMap contigs_; + std::unordered_map sample_contigs_; +}; + +} // namespace octopus + +#endif From 4d1995fe23843a0dd983783c5bcaac291cc2b32a Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Tue, 11 Oct 2016 20:01:02 +0100 Subject: [PATCH 5/8] Use PloidyMap to pass sample ploidies to Callers Have CallerBuilder store a copy of the PloidyMap built in option collation and seed the requested Caller with ploidy information. --- src/config/option_collation.cpp | 182 +++++++++++++++++----------- src/core/callers/caller_builder.cpp | 25 ++-- src/core/callers/caller_builder.hpp | 10 +- src/core/callers/caller_factory.cpp | 17 +-- src/core/callers/caller_factory.hpp | 5 +- 5 files changed, 135 insertions(+), 104 deletions(-) diff --git a/src/config/option_collation.cpp b/src/config/option_collation.cpp index 9fdc8fba2..57ca68481 100644 --- a/src/config/option_collation.cpp +++ b/src/config/option_collation.cpp @@ -29,6 +29,7 @@ #include "basics/phred.hpp" #include "basics/genomic_region.hpp" #include "basics/aligned_read.hpp" +#include "basics/ploidy_map.hpp" #include "readpipe/read_pipe_fwd.hpp" #include "core/tools/coretools.hpp" #include "core/callers/caller_builder.hpp" @@ -849,88 +850,141 @@ auto make_variant_generator_builder(const OptionMap& options) return result; } -void print_ambiguous_contig_ploidies(const std::vector& contig_ploidies, - const OptionMap& options) +struct ContigPloidyLess { - logging::WarningLogger log {}; - log << "Ambiguous ploidies found"; - - for (auto it = std::cbegin(contig_ploidies), end = std::cend(contig_ploidies); it != end;) { - it = std::adjacent_find(it, std::cend(contig_ploidies), - [] (const auto& lhs, const auto& rhs) { - return lhs.contig == rhs.contig; - }); - if (it != std::cend(contig_ploidies)) { - const auto it2 = std::find_if(std::next(it), std::cend(contig_ploidies), - [=] (const auto& cp) { - return it->contig != cp.contig; - }); - std::ostringstream ss {}; - std::copy(it, it2, std::ostream_iterator(ss, " ")); - log << ss.str(); - it = it2; + bool operator()(const ContigPloidy& lhs, const ContigPloidy& rhs) const noexcept + { + if (lhs.sample) { + if (rhs.sample && *lhs.sample != *rhs.sample) { + return *lhs.sample < *rhs.sample; + } else { + return true; + } + } else if (rhs.sample) { + return false; } + return lhs.contig == rhs.contig ? lhs.ploidy < rhs.ploidy : lhs.contig < rhs.contig; } -} +}; + +struct ContigPloidyEqual +{ + bool operator()(const ContigPloidy& lhs, const ContigPloidy& rhs) const noexcept + { + return lhs.sample == rhs.sample && lhs.contig == rhs.contig && lhs.ploidy == rhs.ploidy; + } +}; + +struct ContigPloidyAmbiguous +{ + bool operator()(const ContigPloidy& lhs, const ContigPloidy& rhs) const noexcept + { + if (lhs.sample && rhs.sample) { + return *lhs.sample == *rhs.sample && lhs.contig == rhs.contig; + } else if (!(lhs.sample || rhs.sample)) { + return lhs.contig == rhs.contig; + } + return false; + } +}; + +class AmbiguousPloidy : public UserError +{ + std::string do_where() const override + { + return "make_caller_factory"; + } + + std::string do_why() const override + { + std::ostringstream ss {}; + ss << "The are contigs with ambiguous ploidy: "; + for (auto it = std::cbegin(ploidies_), end = std::cend(ploidies_); it != end;) { + it = std::adjacent_find(it, std::cend(ploidies_), ContigPloidyAmbiguous {}); + if (it != std::cend(ploidies_)) { + const auto it2 = std::find_if(std::next(it), std::cend(ploidies_), + [=] (const auto& cp) { + return ContigPloidyAmbiguous{}(*it, cp); + }); + std::ostringstream ss {}; + std::copy(it, it2, std::ostream_iterator {ss, " "}); + it = it2; + } + } + return ss.str(); + } + + std::string do_help() const override + { + return "Ensure ploidies are specified only once per sample or per sample contig"; + } + + std::vector ploidies_; + +public: + AmbiguousPloidy(std::vector ploidies) : ploidies_ {ploidies} {} +}; void remove_duplicate_ploidies(std::vector& contig_ploidies) { std::sort(std::begin(contig_ploidies), std::end(contig_ploidies), - [] (const auto& lhs, const auto& rhs) { - return (lhs.contig == rhs.contig) ? lhs.ploidy < rhs.ploidy : lhs.contig < rhs.contig; - }); - const auto it = std::unique(std::begin(contig_ploidies), std::end(contig_ploidies), - [] (const auto& lhs, const auto& rhs) { - return lhs.contig == rhs.contig && lhs.ploidy == rhs.ploidy; - }); - contig_ploidies.erase(it, std::end(contig_ploidies)); + ContigPloidyLess {}); + const auto itr = std::unique(std::begin(contig_ploidies), std::end(contig_ploidies), + ContigPloidyEqual {}); + contig_ploidies.erase(itr, std::end(contig_ploidies)); } bool has_ambiguous_ploidies(const std::vector& contig_ploidies) { const auto it2 = std::adjacent_find(std::cbegin(contig_ploidies), std::cend(contig_ploidies), - [] (const auto& lhs, const auto& rhs) { - return lhs.contig == rhs.contig; - }); + ContigPloidyAmbiguous {}); return it2 != std::cend(contig_ploidies); } -boost::optional> extract_contig_ploidies(const OptionMap& options) +class MissingPloidyFile : public MissingFileError { - std::vector result {}; - + std::string do_where() const override + { + return "get_ploidy_map"; + } +public: + MissingPloidyFile(fs::path p) : MissingFileError {std::move(p), "read path"} {}; +}; + + +PloidyMap get_ploidy_map(const OptionMap& options) +{ + std::vector flat_plodies {}; if (options.count("contig-ploidies-file") == 1) { const fs::path input_path {options.at("contig-ploidies-file").as()}; const auto resolved_path = resolve_path(input_path, options); - - logging::ErrorLogger log {}; - if (!fs::exists(resolved_path)) { - stream(log) << "The path " << input_path - << " given in the input option (--contig-ploidies-file) does not exist"; - return boost::none; + throw MissingPloidyFile {input_path}; } - std::ifstream file {resolved_path.string()}; std::transform(std::istream_iterator(file), std::istream_iterator(), - std::back_inserter(result), [] (const Line& line) { - std::istringstream ss {line.line_data}; - ContigPloidy result {}; - ss >> result; - return result; - }); + std::back_inserter(flat_plodies), [] (const Line& line) { + std::istringstream ss {line.line_data}; + ContigPloidy result {}; + ss >> result; + return result; + }); } - if (options.count("contig-ploidies") == 1) { - utils::append(options.at("contig-ploidies").as>(), result); + utils::append(options.at("contig-ploidies").as>(), flat_plodies); } - - remove_duplicate_ploidies(result); - if (has_ambiguous_ploidies(result)) { - print_ambiguous_contig_ploidies(result, options); - return boost::none; + remove_duplicate_ploidies(flat_plodies); + if (has_ambiguous_ploidies(flat_plodies)) { + throw AmbiguousPloidy {flat_plodies}; + } + PloidyMap result {as_unsigned("organism-ploidy", options)}; + for (const auto& p : flat_plodies) { + if (p.sample) { + result.set(*p.sample, p.contig, p.ploidy); + } else { + result.set(p.contig, p.ploidy); + } } - return result; } @@ -1182,7 +1236,7 @@ CallerFactory make_caller_factory(const ReferenceGenome& reference, ReadPipe& re } else { vc_builder.set_min_variant_posterior(min_variant_posterior); } - + vc_builder.set_ploidies(get_ploidy_map(options)); vc_builder.set_max_haplotypes(get_max_haplotypes(options)); vc_builder.set_haplotype_extension_threshold(options.at("haplotype-extension-threshold").as>()); auto min_phase_score = options.at("min-phase-score").as>(); @@ -1208,26 +1262,12 @@ CallerFactory make_caller_factory(const ReferenceGenome& reference, ReadPipe& re // vc_builder.set_model_filtering(!(options.at("disable-call-filtering").as() // || options.at("disable-model-filtering").as())); - const auto contig_ploidies = extract_contig_ploidies(options); - if (!contig_ploidies) { - // TODO - } if (call_sites_only(options)) { vc_builder.set_sites_only(); } - vc_builder.set_flank_scoring(allow_flank_scoring(options)); - CallerFactory result {std::move(vc_builder), as_unsigned("organism-ploidy", options)}; - for (const auto& p : regions) { - const auto it = std::find_if(std::cbegin(*contig_ploidies), std::cend(*contig_ploidies), - [&] (const auto& cp) { return cp.contig == p.first; }); - if (it != std::cend(*contig_ploidies)) { - result.set_contig_ploidy(p.first, it->ploidy); - } - } - - return result; + return CallerFactory {std::move(vc_builder)}; } boost::optional get_final_output_path(const OptionMap& options) diff --git a/src/core/callers/caller_builder.cpp b/src/core/callers/caller_builder.cpp index ff79641f7..8fe5d08f0 100644 --- a/src/core/callers/caller_builder.cpp +++ b/src/core/callers/caller_builder.cpp @@ -70,9 +70,9 @@ CallerBuilder& CallerBuilder::set_variant_generator(const VariantGeneratorBuilde return *this; } -CallerBuilder& CallerBuilder::set_ploidy(unsigned ploidy) noexcept +CallerBuilder& CallerBuilder::set_ploidies(PloidyMap ploidies) noexcept { - params_.ploidy = ploidy; + params_.ploidies = std::move(ploidies); return *this; } @@ -191,11 +191,12 @@ CallerBuilder& CallerBuilder::set_denovo_mutation_rate(double rate) noexcept return *this; } -std::unique_ptr CallerBuilder::build() const +std::unique_ptr CallerBuilder::build(const ContigName& contig) const { if (factory_.count(caller_) == 0) { throw std::runtime_error {"CallerBuilder: unknown caller " + caller_}; } + requested_contig_ = contig; return factory_.at(caller_)(); } @@ -223,38 +224,38 @@ CallerBuilder::CallerFactoryMap CallerBuilder::generate_factory() const params_.allow_flank_scoring, params_.allow_model_filtering }; - + const auto& samples = components_.read_pipe.get().samples(); return CallerFactoryMap { - {"individual", [this, general_parameters = std::move(general_parameters)] () { + {"individual", [this, general_parameters = std::move(general_parameters), &samples] () { return std::make_unique(make_components(), std::move(general_parameters), IndividualCaller::Parameters { - params_.ploidy, + params_.ploidies(samples.front(), *requested_contig_), {params_.snp_heterozygosity, params_.indel_heterozygosity}, params_.min_variant_posterior, params_.min_refcall_posterior }); }}, - {"population", [this, general_parameters = std::move(general_parameters)] () { + {"population", [this, general_parameters = std::move(general_parameters), &samples] () { return std::make_unique(make_components(), std::move(general_parameters), PopulationCaller::Parameters { params_.min_variant_posterior, params_.min_refcall_posterior, - params_.ploidy, + params_.ploidies(samples.front(), *requested_contig_), {params_.snp_heterozygosity, params_.indel_heterozygosity} }); }}, - {"cancer", [this, general_parameters = std::move(general_parameters)] () { + {"cancer", [this, general_parameters = std::move(general_parameters), &samples] () { return std::make_unique(make_components(), std::move(general_parameters), CancerCaller::Parameters { params_.min_variant_posterior, params_.min_somatic_posterior, params_.min_refcall_posterior, - params_.ploidy, + params_.ploidies(samples.front(), *requested_contig_), params_.normal_sample, {params_.snp_heterozygosity, params_.indel_heterozygosity}, @@ -269,7 +270,9 @@ CallerBuilder::CallerFactoryMap CallerBuilder::generate_factory() const std::move(general_parameters), TrioCaller::Parameters { *params_.trio, - params_.ploidy, params_.ploidy, params_.ploidy, + params_.ploidies(params_.trio->mother(), *requested_contig_), + params_.ploidies(params_.trio->father(), *requested_contig_), + params_.ploidies(params_.trio->child(), *requested_contig_), {params_.snp_heterozygosity, params_.indel_heterozygosity}, {*params_.denovo_mutation_rate}, diff --git a/src/core/callers/caller_builder.hpp b/src/core/callers/caller_builder.hpp index 9b7c5cc39..7b2dddd11 100644 --- a/src/core/callers/caller_builder.hpp +++ b/src/core/callers/caller_builder.hpp @@ -12,6 +12,7 @@ #include #include "config/common.hpp" +#include "basics/ploidy_map.hpp" #include "readpipe/read_pipe.hpp" #include "core/tools/coretools.hpp" #include "core/types/trio.hpp" @@ -41,7 +42,7 @@ class CallerBuilder CallerBuilder& set_reference(const ReferenceGenome& reference) noexcept; CallerBuilder& set_read_pipe(const ReadPipe& read_pipe) noexcept; CallerBuilder& set_variant_generator(const VariantGeneratorBuilder& vb) noexcept; - CallerBuilder& set_ploidy(unsigned ploidy) noexcept; + CallerBuilder& set_ploidies(PloidyMap ploidies) noexcept; CallerBuilder& set_caller(std::string caller); CallerBuilder& set_refcall_type(Caller::RefCallType type) noexcept; CallerBuilder& set_sites_only() noexcept; @@ -69,7 +70,7 @@ class CallerBuilder // pedigree CallerBuilder& set_pedigree(Pedigree pedigree); - std::unique_ptr build() const; + std::unique_ptr build(const ContigName& contig) const; private: struct Components @@ -84,7 +85,7 @@ class CallerBuilder struct Parameters { // common - unsigned ploidy; + PloidyMap ploidies; Caller::RefCallType refcall_type = Caller::RefCallType::none; bool call_sites_only = false; Phred min_variant_posterior; @@ -119,6 +120,9 @@ class CallerBuilder Components components_; Parameters params_; CallerFactoryMap factory_; + + mutable boost::optional requested_contig_; + Caller::Components make_components() const; CallerFactoryMap generate_factory() const; }; diff --git a/src/core/callers/caller_factory.cpp b/src/core/callers/caller_factory.cpp index 3c00e99c9..af750560e 100644 --- a/src/core/callers/caller_factory.cpp +++ b/src/core/callers/caller_factory.cpp @@ -10,10 +10,8 @@ namespace octopus { -CallerFactory::CallerFactory(CallerBuilder template_builder, const unsigned default_ploidy) +CallerFactory::CallerFactory(CallerBuilder template_builder) : template_builder_ {std::move(template_builder)} -, contig_ploidies_ {} -, default_ploidy_ {default_ploidy} {} CallerFactory& CallerFactory::set_reference(const ReferenceGenome& reference) noexcept @@ -28,20 +26,9 @@ CallerFactory& CallerFactory::set_read_pipe(ReadPipe& read_pipe) noexcept return *this; } -CallerFactory& CallerFactory::set_contig_ploidy(const ContigName& contig, const unsigned ploidy) -{ - contig_ploidies_[contig] = ploidy; - return *this; -} - std::unique_ptr CallerFactory::make(const ContigName& contig) const { - if (contig_ploidies_.count(contig) == 1) { - template_builder_.set_ploidy(contig_ploidies_.at(contig)); - } else { - template_builder_.set_ploidy(default_ploidy_); - } - return template_builder_.build(); + return template_builder_.build(contig); } } // namespace octopus diff --git a/src/core/callers/caller_factory.hpp b/src/core/callers/caller_factory.hpp index f2d2f9e97..de8d95bbc 100644 --- a/src/core/callers/caller_factory.hpp +++ b/src/core/callers/caller_factory.hpp @@ -23,7 +23,7 @@ class CallerFactory CallerFactory() = delete; - CallerFactory(CallerBuilder template_builder, unsigned default_ploidy); + CallerFactory(CallerBuilder template_builder); CallerFactory(const CallerFactory&) = default; CallerFactory& operator=(const CallerFactory&) = default; @@ -34,14 +34,11 @@ class CallerFactory CallerFactory& set_reference(const ReferenceGenome& reference) noexcept; CallerFactory& set_read_pipe(ReadPipe& read_pipe) noexcept; - CallerFactory& set_contig_ploidy(const ContigName& contig, unsigned ploidy); std::unique_ptr make(const ContigName& contig) const; private: mutable CallerBuilder template_builder_; - std::unordered_map contig_ploidies_; - unsigned default_ploidy_; }; } // namespace octopus From 04c11f650146c1d1d2a7c8457d1ee7fd6338c8c7 Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Tue, 11 Oct 2016 22:16:09 +0100 Subject: [PATCH 6/8] Get Travis working --- .travis.yml | 131 ++++++++++++++++++++++++++++++++-------------------- README.md | 2 +- 2 files changed, 81 insertions(+), 52 deletions(-) diff --git a/.travis.yml b/.travis.yml index dc3abb55d..b4508986c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,69 +1,98 @@ sudo: required dist: trusty -language: cpp +language: generic osx_image: xcode8 -before_install: +install: + ############################################################################ + # All the Linux dependencies are installed in ${TRAVIS_BUILD_DIR}/deps/ + ############################################################################ + - DEPS_DIR="${TRAVIS_BUILD_DIR}/deps" + - mkdir -p ${DEPS_DIR} && cd ${DEPS_DIR} + + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export DEBIAN_FRONTEND=noninteractive; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get update -qy; fi + + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi + + ############################################################################ + # Install CMake + ############################################################################ - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - sudo add-apt-repository ppa:george-edison55/cmake-3.x -y; - sudo apt-get update -q; - sudo apt-get upgrade cmake -y; - sudo apt-get install python3 -y; - sudo apt-get upgrade libboost-all-dev -y; - git clone https://github.com/samtools/htslib.git; - cd htslib && autoheader && autoconf && ./configure && make && sudo make install; + CMAKE_URL="https://cmake.org/files/v3.6/cmake-3.6.2-Linux-x86_64.tar.gz"; + mkdir -p cmake && travis_retry wget --no-check-certificate --quiet -O - "${CMAKE_URL}" | tar --strip-components=1 -xz -C cmake; + export PATH=${DEPS_DIR}/cmake/bin:${PATH}; + else + brew outdated cmake || brew upgrade cmake; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then - brew update; - brew outdated cmake || upgrade cmake; + + ############################################################################ + # Install Boost + ############################################################################ + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + BOOST_VERSION=trunk; + BOOST_DIR=${DEPS_DIR}/boost-${BOOST_VERSION}; + BOOST_URL="http://github.com/boostorg/boost.git"; + travis_retry git clone --depth 1 --recursive --quiet ${BOOST_URL} ${BOOST_DIR} || exit 1; + cd ${BOOST_DIR} && ./bootstrap.sh && ./b2 headers; + cd ${BOOST_DIR}/tools/build && ./bootstrap.sh && ./b2 install --prefix=${DEPS_DIR}/b2; + export PATH=${DEPS_DIR}/b2/bin:${PATH}; + fi + + ############################################################################ + # Install Python3 + ############################################################################ + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + sudo apt-get install python3 -qy; + else brew install python3; - brew outdated boost || upgrade boost; + fi + + ############################################################################ + # Install htslib + ############################################################################ + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + git clone https://github.com/samtools/htslib.git; + cd htslib && autoheader && autoconf && ./configure && make && sudo make install; + else brew tap homebrew/science; brew install htslib; fi matrix: include: - - os: linux - compiler: gcc - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 - env: COMPILER=g++-6 - - os: linux - compiler: clang - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.8 - packages: - - clang-3.8 - env: COMPILER=clang++-3.8 +# - os: linux +# compiler: gcc +# addons: +# apt: +# sources: +# - ubuntu-toolchain-r-test +# packages: +# - g++-6 +# env: COMPILER=g++ +# - os: linux +# compiler: clang +# addons: +# apt: +# sources: +# - ubuntu-toolchain-r-test +# - llvm-toolchain-precise-3.8 +# packages: +# - clang-3.8 +# env: COMPILER=clang++ - os: osx compiler: clang - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.8 - packages: - - clang-3.8 - install: export PATH=/usr/local/clang/bin:$PATH - env: COMPILER=clang++-3.8 - - os: osx - compiler: gcc - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 - env: COMPILER=g++-6 + before_install: + - brew install --with-clang llvm + - brew outdated boost || brew upgrade boost + env: COMPILER=clang++ + +before_script: + - cd "${TRAVIS_BUILD_DIR}" script: - - ./install.py --compiler=$COMPILER \ No newline at end of file + - ./install.py --compiler=$COMPILER + +notifications: + email: false \ No newline at end of file diff --git a/README.md b/README.md index 1eba2fda1..eeec39630 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![Octopus Logo](logo.png) -[![Build Status](https://travis-ci.com/luntergroup/octopus.svg?token=U9L3a7MWio2P3XpPT3JV&branch=master)](https://travis-ci.com/luntergroup/octopus) +[![Build Status](https://travis-ci.org/luntergroup/octopus.svg?branch=master)](https://travis-ci.org/luntergroup/octopus) [![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) [![Gitter](https://badges.gitter.im/octopus-caller/Lobby.svg)](https://gitter.im/octopus-caller/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) From ca2300df81944c44a8ca79c808f6620656c536d9 Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Tue, 11 Oct 2016 22:19:05 +0100 Subject: [PATCH 7/8] Switch Travis badge to branch develop --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eeec39630..728cb27d3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![Octopus Logo](logo.png) -[![Build Status](https://travis-ci.org/luntergroup/octopus.svg?branch=master)](https://travis-ci.org/luntergroup/octopus) +[![Build Status](https://travis-ci.org/luntergroup/octopus.svg?branch=develop)](https://travis-ci.org/luntergroup/octopus) [![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) [![Gitter](https://badges.gitter.im/octopus-caller/Lobby.svg)](https://gitter.im/octopus-caller/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) From 1508a943e80ca89c6e6074094e8e9d4f58177c7b Mon Sep 17 00:00:00 2001 From: Daniel Cooke Date: Wed, 12 Oct 2016 21:12:05 +0100 Subject: [PATCH 8/8] Switch Travis badge to master branch --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 728cb27d3..eeec39630 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![Octopus Logo](logo.png) -[![Build Status](https://travis-ci.org/luntergroup/octopus.svg?branch=develop)](https://travis-ci.org/luntergroup/octopus) +[![Build Status](https://travis-ci.org/luntergroup/octopus.svg?branch=master)](https://travis-ci.org/luntergroup/octopus) [![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) [![Gitter](https://badges.gitter.im/octopus-caller/Lobby.svg)](https://gitter.im/octopus-caller/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)