diff --git a/src/config/option_collation.cpp b/src/config/option_collation.cpp index ac6c7a397..a3f95fcb7 100644 --- a/src/config/option_collation.cpp +++ b/src/config/option_collation.cpp @@ -267,13 +267,14 @@ ReferenceGenome make_reference(const OptionMap& options) InputRegionMap make_search_regions(const std::vector& regions) { - InputRegionMap contig_mapped_regions {}; + std::map> contig_mapped_regions {}; for (const auto& region : regions) { - contig_mapped_regions[region.contig_name()].insert(region); + contig_mapped_regions[region.contig_name()].push_back(region); } InputRegionMap result {}; result.reserve(contig_mapped_regions.size()); - for (const auto& p : contig_mapped_regions) { + for (auto& p : contig_mapped_regions) { + std::sort(std::begin(p.second), std::end(p.second)); auto covered_contig_regions = extract_covered_regions(p.second); result.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), @@ -288,26 +289,24 @@ InputRegionMap extract_search_regions(const ReferenceGenome& reference) return make_search_regions(get_all_contig_regions(reference)); } -auto cut(const MappableFlatSet& mappables, const MappableFlatSet& regions) +auto get_unskipped(const MappableFlatSet& regions, const MappableFlatSet& skips) { - if (mappables.empty()) return regions; + if (skips.empty()) return regions; MappableFlatSet result {}; for (const auto& region : regions) { - auto overlapped = mappables.overlap_range(region); + const auto overlapped = skips.overlap_range(region); if (empty(overlapped)) { result.emplace(region); - } else if (!is_same_region(region, overlapped.front())) { - auto chunk = region; - if (begins_before(overlapped.front(), chunk)) { - chunk = right_overhang_region(chunk, overlapped.front()); - overlapped.advance_begin(1); + } else if (!contains(overlapped.front(), region)) { + if (begins_before(region, overlapped.front())) { + result.emplace(left_overhang_region(region, overlapped.front())); } - std::for_each(std::cbegin(overlapped), std::cend(overlapped), [&] (const auto& region) { - result.emplace(left_overhang_region(chunk, region)); - chunk = expand_lhs(chunk, -begin_distance(chunk, region)); - }); - if (ends_before(overlapped.back(), chunk)) { - result.emplace(right_overhang_region(chunk, overlapped.back())); + auto intervening_chunks = extract_intervening_regions(overlapped); + using std::make_move_iterator; + result.insert(make_move_iterator(std::begin(intervening_chunks)), + make_move_iterator(std::end(intervening_chunks))); + if (ends_before(overlapped.back(), region)) { + result.emplace(right_overhang_region(region, overlapped.back())); } } } @@ -323,7 +322,7 @@ InputRegionMap extract_search_regions(const std::vector& regions, InputRegionMap result {input_regions.size()}; for (auto& p : input_regions) { if (skipped.count(p.first) == 1) { - result.emplace(p.first, cut(skipped.at(p.first), std::move(p.second))); + result.emplace(p.first, get_unskipped(std::move(p.second), skipped.at(p.first))); } else { result.emplace(p.first, std::move(p.second)); } @@ -1352,7 +1351,8 @@ CallerFactory make_caller_factory(const ReferenceGenome& reference, ReadPipe& re vc_builder.set_min_somatic_posterior(min_somatic_posterior); } else if (caller == "trio") { vc_builder.set_trio(make_trio(read_pipe.samples(), options, pedigree)); - vc_builder.set_denovo_mutation_rate(options.at("denovo-mutation-rate").as()); + vc_builder.set_snv_denovo_mutation_rate(options.at("snv-denovo-mutation-rate").as()); + vc_builder.set_indel_denovo_mutation_rate(options.at("indel-denovo-mutation-rate").as()); vc_builder.set_min_denovo_posterior(options.at("min-denovo-posterior").as>()); } diff --git a/src/config/option_parser.cpp b/src/config/option_parser.cpp index a35da45d2..329d301d8 100644 --- a/src/config/option_parser.cpp +++ b/src/config/option_parser.cpp @@ -473,9 +473,13 @@ OptionMap parse_options(const int argc, const char** argv) po::value(), "Paternal sample") - ("denovo-mutation-rate", - po::value()->default_value(1e-8, "1e-8"), - "Expected de novo mutation rate, per megabase pair, for this sample") + ("snv-denovo-mutation-rate", + po::value()->default_value(1e-9, "1e-9"), + "SNV de novo mutation rate, per base per generation") + + ("indel-denovo-mutation-rate", + po::value()->default_value(1e-10, "1e-10"), + "INDEL de novo mutation rate, per base per generation") ("min-denovo-posterior", po::value>()->default_value(Phred {0.5}), @@ -518,7 +522,7 @@ OptionMap parse_options(const int argc, const char** argv) "Include the read mapping quality in the haplotype likelihood calculation") ("max-joint-genotypes", - po::value()->default_value(200000), + po::value()->default_value(1000000), "The maximum number of joint genotype vectors to consider when computing joint" " genotype posterior probabilities") @@ -794,6 +798,16 @@ void check_strictly_positive(const std::string& option, const OptionMap& vm) } } +void check_probability(const std::string& option, const OptionMap& vm) +{ + if (vm.count(option) == 1) { + const auto value = vm.at(option).as(); + if (value < 0 || value > 1) { + throw InvalidCommandLineOptionValue {option, value, "must be between zero and one" }; + } + } +} + void conflicting_options(const OptionMap& vm, const std::string& opt1, const std::string& opt2) { if (vm.count(opt1) == 1 && !vm[opt1].defaulted() && vm.count(opt2) == 1 && !vm[opt2].defaulted()) { @@ -888,6 +902,11 @@ void validate(const OptionMap& vm) "max-haplotypes", "haplotype-holdout-threshold", "haplotype-overflow", "max-joint-genotypes" }; + const std::vector probability_options { + "snp-heterozygosity", "snp-heterozygosity-stdev", "indel-heterozygosity", + "somatic-mutation-rate", "min-expected-somatic-frequency", "min-credible-somatic-frequency", "credible-mass", + "snv-denovo-mutation-rate", "indel-denovo-mutation-rate" + }; conflicting_options(vm, "maternal-sample", "normal-sample"); conflicting_options(vm, "paternal-sample", "normal-sample"); for (const auto& option : positive_int_options) { @@ -896,6 +915,9 @@ void validate(const OptionMap& vm) for (const auto& option : strictly_positive_int_options) { check_strictly_positive(option, vm); } + for (const auto& option : probability_options) { + check_probability(option, vm); + } check_reads_present(vm); check_region_files_consistent(vm); check_trio_consistent(vm); diff --git a/src/core/callers/caller_builder.cpp b/src/core/callers/caller_builder.cpp index 3c32c8793..aaf5b8370 100644 --- a/src/core/callers/caller_builder.cpp +++ b/src/core/callers/caller_builder.cpp @@ -216,9 +216,15 @@ CallerBuilder& CallerBuilder::set_min_denovo_posterior(Phred posterior) return *this; } -CallerBuilder& CallerBuilder::set_denovo_mutation_rate(double rate) noexcept +CallerBuilder& CallerBuilder::set_snv_denovo_mutation_rate(double rate) noexcept { - params_.denovo_mutation_rate = rate; + params_.snv_denovo_mutation_rate = rate; + return *this; +} + +CallerBuilder& CallerBuilder::set_indel_denovo_mutation_rate(double rate) noexcept +{ + params_.indel_denovo_mutation_rate = rate; return *this; } @@ -348,7 +354,7 @@ CallerBuilder::CallerFactoryMap CallerBuilder::generate_factory() const params_.ploidies.of(params_.trio->father(), *requested_contig_), params_.ploidies.of(params_.trio->child(), *requested_contig_), make_trio_prior_model(params_.snp_heterozygosity, params_.indel_heterozygosity), - {*params_.denovo_mutation_rate}, + {*params_.snv_denovo_mutation_rate, *params_.indel_denovo_mutation_rate}, params_.min_variant_posterior, params_.min_denovo_posterior, params_.min_refcall_posterior, diff --git a/src/core/callers/caller_builder.hpp b/src/core/callers/caller_builder.hpp index 13e6cab63..5e6e2030b 100644 --- a/src/core/callers/caller_builder.hpp +++ b/src/core/callers/caller_builder.hpp @@ -70,7 +70,8 @@ class CallerBuilder // trio CallerBuilder& set_trio(Trio trio); CallerBuilder& set_min_denovo_posterior(Phred posterior) noexcept; - CallerBuilder& set_denovo_mutation_rate(double rate) noexcept; + CallerBuilder& set_snv_denovo_mutation_rate(double rate) noexcept; + CallerBuilder& set_indel_denovo_mutation_rate(double rate) noexcept; // pedigree CallerBuilder& set_pedigree(Pedigree pedigree); @@ -117,7 +118,7 @@ class CallerBuilder // trio boost::optional trio; Phred min_denovo_posterior; - boost::optional denovo_mutation_rate; + boost::optional snv_denovo_mutation_rate, indel_denovo_mutation_rate; // pedigree boost::optional pedigree; diff --git a/src/core/callers/trio_caller.cpp b/src/core/callers/trio_caller.cpp index f0320deb1..a19c66097 100644 --- a/src/core/callers/trio_caller.cpp +++ b/src/core/callers/trio_caller.cpp @@ -295,12 +295,15 @@ TrioCaller::calculate_model_posterior(const std::vector& haplotypes, const Latents& latents) const { const auto max_ploidy = std::max({parameters_.maternal_ploidy, parameters_.paternal_ploidy, parameters_.child_ploidy}); - const auto genotypes = generate_all_genotypes(haplotypes, max_ploidy + 1); + std::vector> genotype_indices {}; + const auto genotypes = generate_all_genotypes(haplotypes, max_ploidy + 1, genotype_indices); const auto germline_prior_model = make_prior_model(haplotypes); - DeNovoModel denovo_model {parameters_.denovo_model_params, haplotypes.size(), DeNovoModel::CachingStrategy::address}; + DeNovoModel denovo_model {parameters_.denovo_model_params}; + germline_prior_model->prime(haplotypes); + denovo_model.prime(haplotypes); const model::TrioModel model {parameters_.trio, *germline_prior_model, denovo_model, TrioModel::Options {parameters_.max_joint_genotypes}}; - const auto inferences = model.evaluate(genotypes, haplotype_likelihoods); + const auto inferences = model.evaluate(genotypes, genotype_indices, haplotype_likelihoods); return octopus::calculate_model_posterior(latents.model_latents.log_evidence, inferences.log_evidence); } diff --git a/src/core/models/genotype/trio_model.cpp b/src/core/models/genotype/trio_model.cpp index d6c4f2056..7086143c8 100644 --- a/src/core/models/genotype/trio_model.cpp +++ b/src/core/models/genotype/trio_model.cpp @@ -97,7 +97,7 @@ struct GenotypeRefProbabilityPairGenotypeLess struct ParentsProbabilityPair { GenotypeReference maternal, paternal; - double probability; + double probability, maternal_likelihood, paternal_likelihood; const GenotypeIndiceVector* maternal_indices = nullptr, *paternal_indices = nullptr; }; @@ -249,10 +249,65 @@ auto reduce(std::vector& zipped, const TrioModel::Options& options) return make_reduction_map(zipped, last_to_join, options); } +struct UniformPriorJointProbabilityHelper +{ + std::size_t index; + double probability; +}; + +bool operator==(const UniformPriorJointProbabilityHelper& lhs, const UniformPriorJointProbabilityHelper& rhs) noexcept +{ + return lhs.probability == rhs.probability; +} +bool operator!=(const UniformPriorJointProbabilityHelper& lhs, const UniformPriorJointProbabilityHelper& rhs) noexcept +{ + return lhs.probability != rhs.probability; +} +bool operator<(const UniformPriorJointProbabilityHelper& lhs, const UniformPriorJointProbabilityHelper& rhs) noexcept +{ + return lhs.probability < rhs.probability; +} +bool operator>(const UniformPriorJointProbabilityHelper& lhs, const UniformPriorJointProbabilityHelper& rhs) noexcept +{ + return lhs.probability > rhs.probability; +} + auto reduce(std::vector& zipped, const TrioModel::Options& options) { - auto last_to_join = reduce(zipped, get_sample_reduction_count(options.max_joint_genotypes), - options.max_joint_mass_loss); + const auto reduction_count = get_sample_reduction_count(options.max_joint_genotypes); + auto last_to_join = reduce(zipped, reduction_count, options.max_joint_mass_loss); + if (last_to_join != std::cend(zipped)) { + std::vector likelihood_zipped(zipped.size()); + for (std::size_t i {0}; i < zipped.size(); ++i) { + likelihood_zipped[i].index = i; + likelihood_zipped[i].probability = zipped[i].maternal_likelihood + zipped[i].paternal_likelihood; + } + likelihood_zipped.erase(reduce(likelihood_zipped, reduction_count, options.max_joint_mass_loss), + std::cend(likelihood_zipped)); + std::deque new_indices {}; + const auto num_posterior_joined = static_cast(std::distance(std::begin(zipped), last_to_join)); + for (const auto& p : likelihood_zipped) { + if (p.index >= num_posterior_joined) { + new_indices.push_back(p.index); + } + } + likelihood_zipped.clear(); + if (!new_indices.empty()) { + std::sort(std::begin(new_indices), std::end(new_indices)); + auto curr_index = num_posterior_joined; + // Can use std::partition as it must do a single left-to-right pass due to ForwardIterator + // and complexity requirements + last_to_join = std::partition(last_to_join, std::end(zipped), [&] (const auto& p) { + if (!new_indices.empty() && curr_index++ == new_indices.front()) { + new_indices.pop_front(); + return true; + } else { + return false; + } + }); + assert(new_indices.empty()); + } + } return make_reduction_map(zipped, last_to_join, options); } @@ -343,17 +398,20 @@ auto join(const ReducedVectorMap& maternal, result.reserve(join_size(maternal, paternal)); std::for_each(maternal.first, maternal.last_to_join, [&] (const auto& m) { std::for_each(paternal.first, paternal.last_to_join, [&] (const auto& p) { - result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), m.indices, p.indices}); + result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), + m.probability, m.probability, m.indices, p.indices}); }); }); std::for_each(maternal.last_to_join, maternal.last, [&] (const auto& m) { std::for_each(paternal.first, paternal.last_to_partially_join, [&] (const auto& p) { - result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), m.indices, p.indices}); + result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), + m.probability, m.probability, m.indices, p.indices}); }); }); std::for_each(paternal.last_to_join, paternal.last, [&] (const auto& p) { std::for_each(maternal.first, maternal.last_to_partially_join, [&] (const auto& m) { - result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), m.indices, p.indices}); + result.push_back({m.genotype, p.genotype, joint_probability(m, p, model), + m.probability, m.probability, m.indices, p.indices}); }); }); return result; @@ -586,7 +644,7 @@ auto join(const ReducedVectorMap& parents, } } else if (child_ploidy == 3 && maternal_ploidy == 3 && paternal_ploidy == 3) { - return join(parents, child, ProbabilityOfChildGivenParents<2, 2, 2> {mutation_model}); + return join(parents, child, ProbabilityOfChildGivenParents<3, 3, 3> {mutation_model}); } throw std::runtime_error {"TrioModel: unimplemented joint probability function"}; } diff --git a/src/core/models/mutation/denovo_model.cpp b/src/core/models/mutation/denovo_model.cpp index 9f7d1fc21..6e16a3987 100644 --- a/src/core/models/mutation/denovo_model.cpp +++ b/src/core/models/mutation/denovo_model.cpp @@ -9,28 +9,70 @@ #include #include #include +#include #include +#include "tandem/tandem.hpp" #include "basics/phred.hpp" #include "utils/maths.hpp" #include "core/types/variant.hpp" namespace octopus { -auto make_hmm_model(const double denovo_mutation_rate) noexcept +namespace { + +auto make_flat_hmm_model(const double snv_mutation_rate, const double indel_mutation_rate, + const double max_rate = 0.5) noexcept +{ + auto mutation_penalty = static_cast(probability_to_phred(snv_mutation_rate).score()); + auto gap_open_penalty = mutation_penalty; + auto gap_extension_penalty = static_cast(probability_to_phred(std::min(100 * indel_mutation_rate, max_rate)).score()); + return hmm::FlatGapMutationModel {mutation_penalty, gap_open_penalty, gap_extension_penalty}; +} + +auto make_exponential_repeat_count_model(const double base_rate, const double repeat_count_log_multiplier, + const std::size_t max_repeat_count, const double max_rate) { - auto mutation = static_cast(probability_to_phred(denovo_mutation_rate).score()); - auto gap_open = mutation; - auto gap_extension = static_cast(probability_to_phred(std::min(100 * denovo_mutation_rate, 1.0)).score()); - return hmm::BasicMutationModel {mutation, gap_open, gap_extension}; + using Penalty = hmm::VariableGapOpenMutationModel::Penalty; + std::vector result(max_repeat_count); + const auto log_base_mutation_rate = std::log(base_rate); + const auto min_penalty = static_cast(std::max(std::log(max_rate) / -maths::constants::ln10Div10<>, 1.0)); + for (unsigned i {0}; i < max_repeat_count; ++i) { + auto adjusted_log_rate = repeat_count_log_multiplier * i + log_base_mutation_rate; + auto adjusted_phred_rate = adjusted_log_rate / -maths::constants::ln10Div10<>; + if (adjusted_phred_rate > min_penalty) { + static constexpr double max_representable_penalty {127}; + result[i] = static_cast(std::min(adjusted_phred_rate, max_representable_penalty)); + } else { + std::fill(std::next(std::begin(result), i), std::end(result), min_penalty); + break; + } + } + return result; } +auto make_gap_open_model(double indel_mutation_rate, std::size_t max_repeat_number, double max_rate = 0.2) +{ + return make_exponential_repeat_count_model(indel_mutation_rate, 1.5, max_repeat_number, max_rate); +} + +auto make_gap_extend_model(double indel_mutation_rate, std::size_t max_repeat_number, double max_rate = 0.6) +{ + return make_exponential_repeat_count_model(10 * indel_mutation_rate, 2, max_repeat_number, max_rate); +} + +} // namespace + DeNovoModel::DeNovoModel(Parameters parameters, std::size_t num_haplotypes_hint, CachingStrategy caching) -: mutation_model_ {make_hmm_model(parameters.mutation_rate)} +: flat_mutation_model_ {make_flat_hmm_model(parameters.snv_mutation_rate, parameters.indel_mutation_rate)} +, repeat_length_gap_open_model_ {make_gap_open_model(parameters.indel_mutation_rate, 10)} +, repeat_length_gap_extend_model_ {make_gap_extend_model(parameters.indel_mutation_rate, 10)} , min_ln_probability_ {} , num_haplotypes_hint_ {num_haplotypes_hint} , haplotypes_ {} , caching_ {caching} +, gap_open_penalties_ {} +, gap_open_index_cache_ {} , value_cache_ {} , address_cache_ {} , guarded_index_cache_ {} @@ -38,7 +80,8 @@ DeNovoModel::DeNovoModel(Parameters parameters, std::size_t num_haplotypes_hint, , padded_given_ {} , use_unguarded_ {false} { - min_ln_probability_ = 8 * std::log(parameters.mutation_rate); + gap_open_penalties_.reserve(1000); + min_ln_probability_ = 8 * std::log(parameters.snv_mutation_rate); if (caching_ == CachingStrategy::address) { address_cache_.reserve(num_haplotypes_hint_ * num_haplotypes_hint_); } else if (caching == CachingStrategy::value) { @@ -49,19 +92,21 @@ DeNovoModel::DeNovoModel(Parameters parameters, std::size_t num_haplotypes_hint, void DeNovoModel::prime(std::vector haplotypes) { + if (is_primed()) throw std::runtime_error {"DeNovoModel: already primed"}; constexpr std::size_t max_unguardered {50}; - if (haplotypes.size() <= max_unguardered) { - unguarded_index_cache_.assign(haplotypes.size(), std::vector(haplotypes.size(), 0)); - for (unsigned target {0}; target < haplotypes.size(); ++target) { - for (unsigned given {0}; given < haplotypes.size(); ++given) { + haplotypes_ = std::move(haplotypes); + gap_open_index_cache_.resize(haplotypes_.size()); + if (haplotypes_.size() <= max_unguardered) { + unguarded_index_cache_.assign(haplotypes_.size(), std::vector(haplotypes_.size(), 0)); + for (unsigned target {0}; target < haplotypes_.size(); ++target) { + for (unsigned given {0}; given < haplotypes_.size(); ++given) { if (target != given) { - unguarded_index_cache_[target][given] = evaluate_uncached(haplotypes[target], haplotypes[given]); + unguarded_index_cache_[target][given] = evaluate_uncached(target, given); } } } use_unguarded_ = true; } else { - haplotypes_ = std::move(haplotypes); guarded_index_cache_.assign(haplotypes_.size(), std::vector>(haplotypes_.size())); } } @@ -70,6 +115,8 @@ void DeNovoModel::unprime() noexcept { haplotypes_.clear(); haplotypes_.shrink_to_fit(); + gap_open_index_cache_.clear(); + gap_open_index_cache_.shrink_to_fit(); guarded_index_cache_.clear(); guarded_index_cache_.shrink_to_fit(); unguarded_index_cache_.clear(); @@ -100,7 +147,7 @@ double DeNovoModel::evaluate(const unsigned target, const unsigned given) const auto& result = guarded_index_cache_[target][given]; if (!result) { if (target != given) { - result = evaluate_uncached(haplotypes_[target], haplotypes_[given]); + result = evaluate_uncached(target, given); } else { result = 0; } @@ -111,6 +158,14 @@ double DeNovoModel::evaluate(const unsigned target, const unsigned given) const // private methods +namespace { + +auto get_short_tandem_repeats(const Haplotype::NucleotideSequence& given) +{ + constexpr unsigned max_repeat_period {4}; + return tandem::extract_exact_tandem_repeats(given, 1, max_repeat_period); +} + void pad_given(const Haplotype::NucleotideSequence& target, const Haplotype::NucleotideSequence& given, std::string& result) { @@ -137,14 +192,28 @@ bool can_align_with_hmm(const Haplotype& target, const Haplotype& given) noexcep return sequence_length_distance(target, given) <= hmm::min_flank_pad(); } +template +void rotate_right(Container& c, const std::size_t n) +{ + assert(c.size() > n); + std::rotate(std::rbegin(c), std::next(std::rbegin(c), n), std::rend(c)); +} + double hmm_align(const Haplotype::NucleotideSequence& target, const Haplotype::NucleotideSequence& given, - const hmm::BasicMutationModel& model, const boost::optional min_ln_probability) noexcept + const hmm::FlatGapMutationModel& model, const boost::optional min_ln_probability) noexcept { const auto p = hmm::evaluate(target, given, model); return min_ln_probability ? std::max(p, *min_ln_probability) : p; } -double approx_align(const Haplotype& target, const Haplotype& given, const hmm::BasicMutationModel& model, +double hmm_align(const Haplotype::NucleotideSequence& target, const Haplotype::NucleotideSequence& given, + const hmm::VariableGapOpenMutationModel& model, const boost::optional min_ln_probability) noexcept +{ + const auto p = hmm::evaluate(target, given, model); + return min_ln_probability ? std::max(p, *min_ln_probability) : p; +} + +double approx_align(const Haplotype& target, const Haplotype& given, const hmm::FlatGapMutationModel& model, const boost::optional min_ln_probability) { using maths::constants::ln10Div10; @@ -158,20 +227,116 @@ double approx_align(const Haplotype& target, const Haplotype& given, const hmm:: } const auto variants = target.difference(given); const auto mismatch_size = std::accumulate(std::cbegin(variants), std::cend(variants), 0, - [] (auto curr, const auto& variant) noexcept { + [](auto curr, const auto& variant) noexcept { return curr + (!is_indel(variant) ? region_size(variant) : 0); }); score += mismatch_size * model.mutation; return min_ln_probability ? std::max(-ln10Div10<> * score, *min_ln_probability) : -ln10Div10<> * score; } +} // namespace + +boost::optional DeNovoModel::set_gap_open_penalties(const Haplotype& given) const +{ + const auto repeats = get_short_tandem_repeats(given.sequence()); + if (!repeats.empty()) { + gap_open_penalties_.assign(sequence_size(given), flat_mutation_model_.gap_open); + const auto max_num_repeats = static_cast(repeat_length_gap_open_model_.size()); + unsigned max_repeat_number {0}; + for (const auto& repeat : repeats) { + const auto num_repeats = repeat.length / repeat.period; + assert(num_repeats > 0); + const auto penalty = repeat_length_gap_open_model_[std::min(num_repeats - 1, max_num_repeats - 1)]; + assert(repeat.pos + repeat.length <= gap_open_penalties_.size()); + std::fill_n(std::next(std::begin(gap_open_penalties_), repeat.pos), repeat.length, penalty); + max_repeat_number = std::max(num_repeats, max_repeat_number); + } + return max_repeat_number; + } else { + gap_open_penalties_.clear(); + return boost::none; + } +} + +boost::optional DeNovoModel::set_gap_open_penalties(const unsigned given) const +{ + assert(given < gap_open_index_cache_.size()); + auto& cached_result = gap_open_index_cache_[given]; + if (cached_result) { + if (cached_result->second) { + gap_open_penalties_ = cached_result->first; + } + return cached_result->second; + } else { + auto result = set_gap_open_penalties(haplotypes_[given]); + cached_result = std::make_pair(gap_open_penalties_, result); + return result; + } +} + +hmm::VariableGapOpenMutationModel DeNovoModel::make_variable_hmm_model(const unsigned max_repeat_number) const +{ + assert(max_repeat_number > 0); + auto extension_idx = std::min(repeat_length_gap_extend_model_.size() - 1, static_cast(max_repeat_number) - 1); + auto extension_penalty = repeat_length_gap_extend_model_[extension_idx]; + return {flat_mutation_model_.mutation, gap_open_penalties_, extension_penalty}; +} + double DeNovoModel::evaluate_uncached(const Haplotype& target, const Haplotype& given) const { - if (can_align_with_hmm(target, given)) { + if (sequence_size(target) == sequence_size(given)) { + pad_given(target, given, padded_given_); + return hmm_align(target.sequence(), padded_given_, flat_mutation_model_, min_ln_probability_); + } else { + const auto max_repeat_number = set_gap_open_penalties(given); + if (max_repeat_number) { + if (can_align_with_hmm(target, given)) { + pad_given(target, given, padded_given_); + gap_open_penalties_.resize(padded_given_.size(), flat_mutation_model_.gap_open); + rotate_right(gap_open_penalties_, hmm::min_flank_pad()); + const auto model = make_variable_hmm_model(*max_repeat_number); + return hmm_align(target.sequence(), padded_given_, model, min_ln_probability_); + } else { + return approx_align(target, given, flat_mutation_model_, min_ln_probability_); + } + } else { + if (can_align_with_hmm(target, given)) { + pad_given(target, given, padded_given_); + return hmm_align(target.sequence(), padded_given_, flat_mutation_model_, min_ln_probability_); + } else { + return approx_align(target, given, flat_mutation_model_, min_ln_probability_); + } + } + } +} + +double DeNovoModel::evaluate_uncached(const unsigned target_idx, const unsigned given_idx) const +{ + const auto& target = haplotypes_[target_idx]; + const auto& given = haplotypes_[given_idx]; + if (sequence_size(target) == sequence_size(given)) { pad_given(target, given, padded_given_); - return hmm_align(target.sequence(), padded_given_, mutation_model_, min_ln_probability_); + return hmm_align(target.sequence(), padded_given_, flat_mutation_model_, min_ln_probability_); } else { - return approx_align(target, given, mutation_model_, min_ln_probability_); + const auto max_repeat_length = set_gap_open_penalties(given_idx); + if (max_repeat_length) { + if (can_align_with_hmm(target, given)) { + pad_given(target, given, padded_given_); + gap_open_penalties_.resize(padded_given_.size(), flat_mutation_model_.gap_open); + rotate_right(gap_open_penalties_, hmm::min_flank_pad()); + const auto model = make_variable_hmm_model(*max_repeat_length); + return hmm_align(target.sequence(), padded_given_, model, min_ln_probability_); + } else { + return approx_align(target, given, flat_mutation_model_, min_ln_probability_); + } + } else { + if (can_align_with_hmm(target, given)) { + pad_given(target, given, padded_given_); + return hmm_align(target.sequence(), padded_given_, flat_mutation_model_, min_ln_probability_); + } else { + return approx_align(target, given, flat_mutation_model_, min_ln_probability_); + } + } } } diff --git a/src/core/models/mutation/denovo_model.hpp b/src/core/models/mutation/denovo_model.hpp index 3f7c2200f..ea13d4229 100644 --- a/src/core/models/mutation/denovo_model.hpp +++ b/src/core/models/mutation/denovo_model.hpp @@ -22,7 +22,7 @@ class DeNovoModel public: struct Parameters { - double mutation_rate; + double snv_mutation_rate, indel_mutation_rate; }; enum class CachingStrategy { none, value, address }; @@ -60,11 +60,17 @@ class DeNovoModel } }; - hmm::BasicMutationModel mutation_model_; + using PenaltyVector = hmm::VariableGapOpenMutationModel::PenaltyVector; + using GapOpenResult = std::pair>; + + hmm::FlatGapMutationModel flat_mutation_model_; + std::vector repeat_length_gap_open_model_, repeat_length_gap_extend_model_; boost::optional min_ln_probability_; std::size_t num_haplotypes_hint_; std::vector haplotypes_; CachingStrategy caching_; + mutable PenaltyVector gap_open_penalties_; + mutable std::vector> gap_open_index_cache_; mutable std::unordered_map> value_cache_; mutable std::unordered_map, double, AddressPairHash> address_cache_; mutable std::vector>> guarded_index_cache_; @@ -72,7 +78,11 @@ class DeNovoModel mutable std::string padded_given_; mutable bool use_unguarded_; + boost::optional set_gap_open_penalties(const Haplotype& given) const; + boost::optional set_gap_open_penalties(unsigned given) const; + hmm::VariableGapOpenMutationModel make_variable_hmm_model(unsigned max_repeat_length) const; double evaluate_uncached(const Haplotype& target, const Haplotype& given) const; + double evaluate_uncached(unsigned target, unsigned given) const; double evaluate_basic_cache(const Haplotype& target, const Haplotype& given) const; double evaluate_address_cache(const Haplotype& target, const Haplotype& given) const; }; diff --git a/src/core/models/pairhmm/pair_hmm.cpp b/src/core/models/pairhmm/pair_hmm.cpp index 6b28f4d7e..fb4d5e4ab 100644 --- a/src/core/models/pairhmm/pair_hmm.cpp +++ b/src/core/models/pairhmm/pair_hmm.cpp @@ -136,7 +136,7 @@ auto simd_align(const std::string& truth, const std::string& target, target_size, model.snv_mask.data() + alignment_offset, model.snv_priors.data() + alignment_offset, - model.gap_open_penalties.data() + alignment_offset, + model.gap_open.data() + alignment_offset, model.gap_extend, model.nuc_prior); return -ln10Div10<> * static_cast(score); } else { @@ -152,7 +152,7 @@ auto simd_align(const std::string& truth, const std::string& target, target_size, model.snv_mask.data() + alignment_offset, model.snv_priors.data() + alignment_offset, - model.gap_open_penalties.data() + alignment_offset, + model.gap_open.data() + alignment_offset, model.gap_extend, model.nuc_prior, align1.data(), align2.data(), first_pos); auto lhs_flank_size = static_cast(model.lhs_flank_size); @@ -178,7 +178,7 @@ auto simd_align(const std::string& truth, const std::string& target, target.data(), qualities, model.snv_mask.data() + alignment_offset, model.snv_priors.data() + alignment_offset, - model.gap_open_penalties.data() + alignment_offset, + model.gap_open.data() + alignment_offset, model.gap_extend, model.nuc_prior, first_pos, align1.data(), align2.data(), @@ -186,8 +186,13 @@ auto simd_align(const std::string& truth, const std::string& target, const auto num_explained_bases = target_size - target_mask_size; constexpr int min_explained_bases {2}; if (num_explained_bases < min_explained_bases) flank_score = 0; - assert(flank_score <= score); - return -ln10Div10<> * static_cast(score - flank_score); + //assert(flank_score <= score); + if (flank_score <= score) { + return -ln10Div10<> * static_cast(score - flank_score); + } else { + // Overflow has occurred when calculating score; + return -ln10Div10<> * (flank_score + score); + } } } @@ -207,7 +212,7 @@ void validate(const std::string& truth, const std::string& target, if (truth.size() != model.snv_priors.size()) { throw std::invalid_argument {"PairHMM::align: truth size not equal to snv priors length"}; } - if (truth.size() != model.gap_open_penalties.size()) { + if (truth.size() != model.gap_open.size()) { throw std::invalid_argument {"PairHMM::align: truth size not equal to gap open penalties length"}; } if (target_offset + target.size() > truth.size()) { @@ -231,27 +236,58 @@ double evaluate(const std::string& target, const std::string& truth, const auto m2 = std::mismatch(next(m1.first), cend(target), next(m1.second)); if (m2.first == cend(target)) { // then there is only a single base difference between the sequences, can optimise - const auto truth_index = distance(offsetted_truth_begin_itr, m1.second) + target_offset; - if (truth_index < model.lhs_flank_size || truth_index >= (truth.size() - model.rhs_flank_size)) { + const auto truth_mismatch_idx = distance(offsetted_truth_begin_itr, m1.second) + target_offset; + if (truth_mismatch_idx < model.lhs_flank_size || truth_mismatch_idx >= (truth.size() - model.rhs_flank_size)) { return 0; } const auto target_index = distance(cbegin(target), m1.first); auto mispatch_penalty = target_qualities[target_index]; - if (model.snv_mask[truth_index] == *m1.first) { + if (model.snv_mask[truth_mismatch_idx] == *m1.first) { mispatch_penalty = std::min(target_qualities[target_index], - static_cast(model.snv_priors[truth_index])); + static_cast(model.snv_priors[truth_mismatch_idx])); } - if (mispatch_penalty <= model.gap_open_penalties[truth_index] + if (mispatch_penalty <= model.gap_open[truth_mismatch_idx] || !std::equal(next(m1.first), cend(target), m1.second)) { return lnProbability[mispatch_penalty]; } - return lnProbability[model.gap_open_penalties[truth_index]]; + return lnProbability[model.gap_open[truth_mismatch_idx]]; } // TODO: we should be able to optimise the alignment based of the first mismatch postition return simd_align(truth, target, target_qualities, target_offset, model); } -double evaluate(const std::string& target, const std::string& truth, const BasicMutationModel& model) noexcept +double evaluate(const std::string& target, const std::string& truth, const VariableGapOpenMutationModel& model) noexcept +{ + assert(truth.size() == model.gap_open.size()); + using std::cbegin; using std::cend; using std::next; using std::distance; + static constexpr auto lnProbability = make_phred_to_ln_prob_lookup(); + const auto truth_begin = next(cbegin(truth), min_flank_pad()); + const auto m1 = std::mismatch(cbegin(target), cend(target), truth_begin); + if (m1.first == cend(target)) { + return 0; // sequences are equal, can't do better than this + } + const auto m2 = std::mismatch(next(m1.first), cend(target), next(m1.second)); + if (m2.first == cend(target)) { + // then there is only a single base difference between the sequences, can optimise + const auto truth_mismatch_idx = static_cast(distance(cbegin(truth), m1.second)); + if (model.mutation <= model.gap_open[truth_mismatch_idx] || !std::equal(next(m1.first), cend(target), m1.second)) { + return lnProbability[model.gap_open[truth_mismatch_idx]]; + } + return lnProbability[model.mutation]; + } + const auto truth_alignment_size = static_cast(target.size() + 2 * min_flank_pad() - 1); + thread_local std::vector dummy_qualities; + dummy_qualities.assign(target.size(), model.mutation); + auto score = simd::align(truth.c_str(), target.c_str(), + dummy_qualities.data(), + truth_alignment_size, + static_cast(target.size()), + model.gap_open.data(), + model.gap_extend, 2); + return -ln10Div10<> * static_cast(score); +} + +double evaluate(const std::string& target, const std::string& truth, const FlatGapMutationModel& model) noexcept { using std::cbegin; using std::cend; using std::next; using std::distance; static constexpr auto lnProbability = make_phred_to_ln_prob_lookup(); diff --git a/src/core/models/pairhmm/pair_hmm.hpp b/src/core/models/pairhmm/pair_hmm.hpp index 4564bfe43..297c2a12a 100644 --- a/src/core/models/pairhmm/pair_hmm.hpp +++ b/src/core/models/pairhmm/pair_hmm.hpp @@ -20,12 +20,27 @@ struct MutationModel using Penalty = std::int8_t; const std::vector& snv_mask; const std::vector& snv_priors; - const std::vector& gap_open_penalties; + const std::vector& gap_open; short gap_extend; short nuc_prior = 2; std::size_t lhs_flank_size = 0, rhs_flank_size = 0; }; +struct VariableGapOpenMutationModel +{ + using Penalty = std::int8_t; + using PenaltyVector = std::vector; + Penalty mutation; + const PenaltyVector& gap_open; + short gap_extend; +}; + +struct FlatGapMutationModel +{ + std::int8_t mutation; + short gap_open, gap_extend; +}; + // p(target | truth, target_qualities, target_offset, model) // // Warning: The target must be contained by the truth by at least @@ -35,17 +50,19 @@ double evaluate(const std::string& target, const std::string& truth, std::size_t target_offset, const MutationModel& model); -struct BasicMutationModel -{ - std::int8_t mutation; - short gap_open, gap_extend; -}; +// p(target | truth, model) +// +// Warning: The target must be contained by the truth by exactly +// min_flank_pad() on either side. +double evaluate(const std::string& target, const std::string& truth, + const VariableGapOpenMutationModel& model) noexcept; // p(target | truth, model) // // Warning: The target must be contained by the truth by exactly // min_flank_pad() on either side. -double evaluate(const std::string& target, const std::string& truth, const BasicMutationModel& model) noexcept; +double evaluate(const std::string& target, const std::string& truth, + const FlatGapMutationModel& model) noexcept; } // namespace hmm } // namespace octopus diff --git a/src/core/octopus.cpp b/src/core/octopus.cpp index d9f31df39..dde08c51c 100644 --- a/src/core/octopus.cpp +++ b/src/core/octopus.cpp @@ -1194,11 +1194,24 @@ auto add_identifier(const boost::filesystem::path& base, const std::string& iden return base.parent_path() / new_stem; } +void destroy(VcfWriter& writer) +{ + VcfWriter tmp {}; + swap(writer, tmp); +} + auto get_legacy_path(const boost::filesystem::path& native) { return add_identifier(native, "legacy"); } +void make_legacy_copy(const boost::filesystem::path& native) +{ + const VcfReader in {native}; + VcfWriter out {get_legacy_path(native)}; + convert_to_legacy(in, out); +} + void log_run_start(const GenomeCallingComponents& components) { static auto debug_log = get_debug_log(); @@ -1234,9 +1247,7 @@ void run_octopus(GenomeCallingComponents& components, std::string command) log_run_start(components); write_caller_output_header(components, command); - const auto start = std::chrono::system_clock::now(); - try { if (is_multithreaded(components)) { if (DEBUG_MODE) { @@ -1266,26 +1277,24 @@ void run_octopus(GenomeCallingComponents& components, std::string command) } catch (...) {} throw CallingBug {}; } - components.output().close(); - try { if (components.legacy()) { - auto output_path = components.output().path(); + const auto output_path = components.output().path(); + destroy(components.output()); if (output_path) { - const VcfReader native {*output_path}; - VcfWriter legacy {get_legacy_path(native.path())}; - convert_to_legacy(native, legacy); + make_legacy_copy(*output_path); } } - } catch (...) {} - + } catch (...) { + logging::WarningLogger warn_log {}; + warn_log << "Failed to make legacy output"; + } const auto end = std::chrono::system_clock::now(); const auto search_size = sum_region_sizes(components.search_regions()); stream(info_log) << "Finished calling " << utils::format_with_commas(search_size) << "bp, total runtime " << TimeInterval {start, end}; - cleanup(components); } } // namespace octopus diff --git a/src/core/tools/hapgen/haplotype_generator.cpp b/src/core/tools/hapgen/haplotype_generator.cpp index e645e3334..7cfefb2f5 100644 --- a/src/core/tools/hapgen/haplotype_generator.cpp +++ b/src/core/tools/hapgen/haplotype_generator.cpp @@ -333,8 +333,12 @@ unsigned HaplotypeGenerator::max_removal_impact() const const auto novel_region = right_overhang_region(max_lagged_region, active_region_); const auto num_novel_alleles = count_overlapped(alleles_, novel_region); if (num_novel_alleles == 0) return 0; - const auto max_new_haplotypes = std::max(static_cast(std::exp2(num_novel_alleles / 2)), 1u); - const auto num_leftover_haplotypes = policies_.haplotype_limits.target / max_new_haplotypes; + unsigned num_leftover_haplotypes {0}; + static auto max_exponent = static_cast(std::log2(std::numeric_limits::max())); + if (num_novel_alleles / 2 < max_exponent) { + const auto max_new_haplotypes = std::max(static_cast(std::exp2(num_novel_alleles / 2)), 1u); + num_leftover_haplotypes = policies_.haplotype_limits.target / max_new_haplotypes; + } const auto cur_num_haplotypes = tree_.num_haplotypes(); if (cur_num_haplotypes > num_leftover_haplotypes) { return cur_num_haplotypes - num_leftover_haplotypes; @@ -1010,14 +1014,16 @@ void HaplotypeGenerator::populate_tree_with_novel_alleles() if (last_added_novel_itr != std::cend(novel_active_alleles)) { last_added_novel_itr = extend_tree_until(last_added_novel_itr, std::cend(novel_active_alleles), tree_, policies_.haplotype_limits.overflow); - if (!tree_.is_empty()) { - active_region_ = encompassing_region(active_region_, tree_.encompassing_region()); - } - if (in_holdout_mode()) { - active_region_ = encompassing_region(active_region_, *holdout_region_); - } if (last_added_novel_itr != std::cend(novel_active_alleles)) { + if (in_holdout_mode()) { + active_region_ = encompassing_region(active_region_, tree_.encompassing_region()); + active_region_ = encompassing_region(active_region_, *holdout_region_); + } else { + active_region_ = tree_.encompassing_region(); + } throw HaplotypeOverflow {active_region_, tree_.num_haplotypes()}; + } else { + active_region_ = tree_.encompassing_region(); } } } else { diff --git a/src/core/tools/vargen/dynamic_cigar_scanner.cpp b/src/core/tools/vargen/dynamic_cigar_scanner.cpp index dd7ee863a..667ccdeab 100644 --- a/src/core/tools/vargen/dynamic_cigar_scanner.cpp +++ b/src/core/tools/vargen/dynamic_cigar_scanner.cpp @@ -467,7 +467,12 @@ bool DefaultInclusionPredicate::operator()(const Variant& v, const unsigned dept return static_cast(observed_qualities[0]) / alt_sequence_size(v) > 20; } } else { - return num_observations > 1 && static_cast(num_observations) / depth > 0.05; + // deletion or mnv + if (region_size(v) < 10) { + return num_observations > 1 && static_cast(num_observations) / depth > 0.05; + } else { + return num_observations > 1 && num_observations >= 5; + } } } diff --git a/src/core/tools/vcf_record_factory.cpp b/src/core/tools/vcf_record_factory.cpp index b9250b79f..bb91ccb56 100644 --- a/src/core/tools/vcf_record_factory.cpp +++ b/src/core/tools/vcf_record_factory.cpp @@ -37,7 +37,7 @@ namespace octopus { namespace { constexpr char dummy_base {'#'}; -const std::string deleted_sequence {vcfspec::missingValue, 1}; +const std::string deleted_sequence {vcfspec::deletedBase}; } // namespace @@ -282,7 +282,11 @@ std::vector VcfRecordFactory::make(std::vector&& calls) auto new_sequence = old_genotype[i].sequence(); if (base) { const auto& base_sequence = (**base)->get_genotype_call(sample).genotype[i].sequence(); - new_sequence.front() = base_sequence.front(); + if (!base_sequence.empty()) { + new_sequence.front() = base_sequence.front(); + } else { + new_sequence = vcfspec::missingValue; + } } else { new_sequence.front() = actual_reference_base; } @@ -321,10 +325,10 @@ std::vector VcfRecordFactory::make(std::vector&& calls) Genotype new_genotype {ploidy}; for (unsigned i {0}; i < ploidy; ++i) { if (prev_genotype[i].sequence() == deleted_sequence || - (prev_genotype[i].sequence() == old_genotype[i].sequence() + (old_genotype[i] != prev_genotype[i] + && prev_genotype[i].sequence() == old_genotype[i].sequence() && sequence_size(old_genotype[i]) < region_size(old_genotype))) { - Allele::NucleotideSequence new_sequence(1, vcfspec::deletedBase); - Allele new_allele {mapped_region(curr_call), move(new_sequence)}; + Allele new_allele {mapped_region(curr_call), deleted_sequence}; new_genotype.emplace(move(new_allele)); } else { new_genotype.emplace(old_genotype[i]); diff --git a/src/io/variant/vcf_spec.hpp b/src/io/variant/vcf_spec.hpp index 1fd0f6f56..d75ebaf94 100644 --- a/src/io/variant/vcf_spec.hpp +++ b/src/io/variant/vcf_spec.hpp @@ -1,8 +1,8 @@ // Copyright (c) 2016 Daniel Cooke // Use of this source code is governed by the MIT license that can be found in the LICENSE file. -#ifndef vcf_spec_h -#define vcf_spec_h +#ifndef vcf_spec_hpp +#define vcf_spec_hpp #include @@ -15,186 +15,129 @@ */ namespace octopus { namespace vcfspec { - VCF_SPEC_CONSTANT version {"VCFv4.3"}; - - VCF_SPEC_CONSTANT missingValue {"."}; - - constexpr char deletedBase {'*'}; - - namespace header { - - VCF_SPEC_CONSTANT lineOpener {"#"}; - - VCF_SPEC_CONSTANT chrom {"CHROM"}; - - VCF_SPEC_CONSTANT pos {"POS"}; - - VCF_SPEC_CONSTANT id {"ID"}; - - VCF_SPEC_CONSTANT ref {"REF"}; - - VCF_SPEC_CONSTANT alt {"ALT"}; - - VCF_SPEC_CONSTANT qual {"QUAL"}; - - VCF_SPEC_CONSTANT filter {"FILTER"}; - - VCF_SPEC_CONSTANT info {"INFO"}; - - VCF_SPEC_CONSTANT format {"FORMAT"}; - - static constexpr std::array requiredFields { - chrom, pos, id, ref, alt, qual, filter, info - }; - - static constexpr std::array requiredFieldsWithSamples { - chrom, pos, id, ref, alt, qual, filter, info, format - }; - - namespace meta { - - namespace tag { - - VCF_SPEC_CONSTANT info {"INFO"}; - - VCF_SPEC_CONSTANT filter {"FILTER"}; - - VCF_SPEC_CONSTANT format {"FORMAT"}; - - VCF_SPEC_CONSTANT contig {"contig"}; - - } // namespace tag - - namespace struc { - - VCF_SPEC_CONSTANT id {"ID"}; - - VCF_SPEC_CONSTANT number {"Number"}; - - VCF_SPEC_CONSTANT type {"Type"}; - - VCF_SPEC_CONSTANT description {"Description"}; - - VCF_SPEC_CONSTANT source {"Source"}; - - VCF_SPEC_CONSTANT version {"Version"}; - - // Not all these fields are required in a structured field, but those present - // should be in this order - static constexpr std::array order { - id, number, type, description, source, version - }; - - } // namespace struc - - VCF_SPEC_CONSTANT vcfVersion {"fileformat"}; - - } // namespace meta - - } // namespace header - - namespace allele { - - VCF_SPEC_SEPERATOR seperator {','}; - - } // namespace allele - - namespace filter { - - VCF_SPEC_SEPERATOR seperator {';'}; - - VCF_SPEC_CONSTANT pass {"PASS"}; - - } // namespace filter - - namespace info { - - VCF_SPEC_SEPERATOR seperator {';'}; - - VCF_SPEC_SEPERATOR valueSeperator {','}; - - VCF_SPEC_CONSTANT ancestralAllele {"AA"}; - - VCF_SPEC_CONSTANT alleleReadDepth {"AD"}; - - VCF_SPEC_CONSTANT alleleReadDepthForward {"ADF"}; - - VCF_SPEC_CONSTANT alleleReadDepthReverse {"ADR"}; - - VCF_SPEC_CONSTANT combinedReadDepth {"DP"}; - - VCF_SPEC_CONSTANT alleleFrequency {"AF"}; - - VCF_SPEC_CONSTANT totalAlleleCount {"AN"}; - - VCF_SPEC_CONSTANT rmsBaseQuality {"BQ"}; - - VCF_SPEC_CONSTANT dbSNPMember {"DB"}; - - VCF_SPEC_CONSTANT endPosition {"END"}; - - VCF_SPEC_CONSTANT hapmap2Member {"H2"}; - - VCF_SPEC_CONSTANT hapmap3Member {"H3"}; - - VCF_SPEC_CONSTANT rmsMappingQuality {"MQ"}; - - VCF_SPEC_CONSTANT mappingQualityZeroCount {"MQ0"}; - - VCF_SPEC_CONSTANT hapmap3 {"H3"}; - - VCF_SPEC_CONSTANT numSamplesWithData {"NS"}; - - VCF_SPEC_CONSTANT strandBias {"SB"}; - - VCF_SPEC_CONSTANT somatic {"SOMATIC"}; - - VCF_SPEC_CONSTANT validated {"VALIDATED"}; - - VCF_SPEC_CONSTANT thousandGenomes {"1000G"}; - - } // namespace info - - namespace format { - - VCF_SPEC_SEPERATOR seperator {':'}; - - VCF_SPEC_SEPERATOR valueSeperator {','}; - - VCF_SPEC_CONSTANT genotype {"GT"}; - - VCF_SPEC_CONSTANT alleleReadDepth {"AD"}; - - VCF_SPEC_CONSTANT alleleReadDepthForward {"ADF"}; - - VCF_SPEC_CONSTANT alleleReadDepthReverse {"ADR"}; - - VCF_SPEC_CONSTANT combinedReadDepth {"DP"}; - - VCF_SPEC_CONSTANT filter {"FT"}; - - VCF_SPEC_CONSTANT conditionalQuality {"GQ"}; - - VCF_SPEC_CONSTANT posteriors {"GP"}; - - VCF_SPEC_CONSTANT likelihoods {"GL"}; - - VCF_SPEC_CONSTANT haplotypeQualities {"HQ"}; - - VCF_SPEC_CONSTANT rmsMappingQuality {"MQ"}; - - VCF_SPEC_CONSTANT phredLikelihoods {"PL"}; - - VCF_SPEC_CONSTANT phaseQuality {"PQ"}; - - VCF_SPEC_CONSTANT phaseSet {"PS"}; - - VCF_SPEC_CONSTANT phasedSeperator {"|"}; - - VCF_SPEC_CONSTANT unphasedSeperator {"/"}; - - } // namespace format - +VCF_SPEC_CONSTANT version {"VCFv4.3"}; +VCF_SPEC_CONSTANT missingValue {"."}; + +static constexpr char deletedBase {'*'}; + +namespace header { + +VCF_SPEC_CONSTANT lineOpener {"#"}; +VCF_SPEC_CONSTANT chrom {"CHROM"}; +VCF_SPEC_CONSTANT pos {"POS"}; +VCF_SPEC_CONSTANT id {"ID"}; +VCF_SPEC_CONSTANT ref {"REF"}; +VCF_SPEC_CONSTANT alt {"ALT"}; +VCF_SPEC_CONSTANT qual {"QUAL"}; +VCF_SPEC_CONSTANT filter {"FILTER"}; +VCF_SPEC_CONSTANT info {"INFO"}; +VCF_SPEC_CONSTANT format {"FORMAT"}; + +static constexpr std::array requiredFields { + chrom, pos, id, ref, alt, qual, filter, info +}; + +static constexpr std::array requiredFieldsWithSamples { + chrom, pos, id, ref, alt, qual, filter, info, format +}; + +namespace meta { + +namespace tag { + +VCF_SPEC_CONSTANT info {"INFO"}; +VCF_SPEC_CONSTANT filter {"FILTER"}; +VCF_SPEC_CONSTANT format {"FORMAT"}; +VCF_SPEC_CONSTANT contig {"contig"}; + +} // namespace tag + +namespace struc { + +VCF_SPEC_CONSTANT id {"ID"}; +VCF_SPEC_CONSTANT number {"Number"}; +VCF_SPEC_CONSTANT type {"Type"}; +VCF_SPEC_CONSTANT description {"Description"}; +VCF_SPEC_CONSTANT source {"Source"}; +VCF_SPEC_CONSTANT version {"Version"}; + +// Not all these fields are required in a structured field, but those present +// should be in this order +static constexpr std::array order { + id, number, type, description, source, version +}; + +} // namespace struc + +VCF_SPEC_CONSTANT vcfVersion {"fileformat"}; + +} // namespace meta + +} // namespace header + +namespace allele { + +VCF_SPEC_SEPERATOR seperator {','}; + +} // namespace allele + +namespace filter { + +VCF_SPEC_SEPERATOR seperator {';'}; +VCF_SPEC_CONSTANT pass {"PASS"}; + +} // namespace filter + +namespace info { + +VCF_SPEC_SEPERATOR seperator {';'}; +VCF_SPEC_SEPERATOR valueSeperator {','}; +VCF_SPEC_CONSTANT ancestralAllele {"AA"}; +VCF_SPEC_CONSTANT alleleReadDepth {"AD"}; +VCF_SPEC_CONSTANT alleleReadDepthForward {"ADF"}; +VCF_SPEC_CONSTANT alleleReadDepthReverse {"ADR"}; +VCF_SPEC_CONSTANT combinedReadDepth {"DP"}; +VCF_SPEC_CONSTANT alleleFrequency {"AF"}; +VCF_SPEC_CONSTANT totalAlleleCount {"AN"}; +VCF_SPEC_CONSTANT rmsBaseQuality {"BQ"}; +VCF_SPEC_CONSTANT dbSNPMember {"DB"}; +VCF_SPEC_CONSTANT endPosition {"END"}; +VCF_SPEC_CONSTANT hapmap2Member {"H2"}; +VCF_SPEC_CONSTANT hapmap3Member {"H3"}; +VCF_SPEC_CONSTANT rmsMappingQuality {"MQ"}; +VCF_SPEC_CONSTANT mappingQualityZeroCount {"MQ0"}; +VCF_SPEC_CONSTANT hapmap3 {"H3"}; +VCF_SPEC_CONSTANT numSamplesWithData {"NS"}; +VCF_SPEC_CONSTANT strandBias {"SB"}; +VCF_SPEC_CONSTANT somatic {"SOMATIC"}; +VCF_SPEC_CONSTANT validated {"VALIDATED"}; +VCF_SPEC_CONSTANT thousandGenomes {"1000G"}; + +} // namespace info + +namespace format { + +VCF_SPEC_SEPERATOR seperator {':'}; +VCF_SPEC_SEPERATOR valueSeperator {','}; +VCF_SPEC_CONSTANT genotype {"GT"}; +VCF_SPEC_CONSTANT alleleReadDepth {"AD"}; +VCF_SPEC_CONSTANT alleleReadDepthForward {"ADF"}; +VCF_SPEC_CONSTANT alleleReadDepthReverse {"ADR"}; +VCF_SPEC_CONSTANT combinedReadDepth {"DP"}; +VCF_SPEC_CONSTANT filter {"FT"}; +VCF_SPEC_CONSTANT conditionalQuality {"GQ"}; +VCF_SPEC_CONSTANT posteriors {"GP"}; +VCF_SPEC_CONSTANT likelihoods {"GL"}; +VCF_SPEC_CONSTANT haplotypeQualities {"HQ"}; +VCF_SPEC_CONSTANT rmsMappingQuality {"MQ"}; +VCF_SPEC_CONSTANT phredLikelihoods {"PL"}; +VCF_SPEC_CONSTANT phaseQuality {"PQ"}; +VCF_SPEC_CONSTANT phaseSet {"PS"}; +VCF_SPEC_CONSTANT phasedSeperator {"|"}; +VCF_SPEC_CONSTANT unphasedSeperator {"/"}; + +} // namespace format + } // namespace vcfspec } // namespace octopus diff --git a/src/utils/maths.hpp b/src/utils/maths.hpp index 5cbfa800f..5b54fd127 100644 --- a/src/utils/maths.hpp +++ b/src/utils/maths.hpp @@ -133,6 +133,7 @@ auto stdev(const Container& values, UnaryOperation unary_op) template RealType rmq(InputIt first, InputIt last) { + if (first == last) return 0.0; return std::sqrt((std::inner_product(first, last, first, RealType {0})) / static_cast(std::distance(first, last))); } diff --git a/src/utils/path_utils.cpp b/src/utils/path_utils.cpp index 2406dd194..81f8b8acb 100644 --- a/src/utils/path_utils.cpp +++ b/src/utils/path_utils.cpp @@ -73,11 +73,9 @@ fs::path resolve_path(const fs::path& path, const fs::path& working_directory) return expand_user_path(path); // must be a root path } if (fs::exists(path)) { - return path; // must be a root path + return fs::canonical(path); // must be a root path } - const auto parent_dir = path.parent_path(); - if (fs::exists(parent_dir) && fs::is_directory(parent_dir)) { auto tmp = working_directory; tmp /= path; @@ -87,7 +85,6 @@ fs::path resolve_path(const fs::path& path, const fs::path& working_directory) } return path; // must be yet-to-be-created root path } - auto result = working_directory; result /= path; return result; diff --git a/src/utils/read_size_estimator.cpp b/src/utils/read_size_estimator.cpp index 7dc4fa263..c3ded9d2f 100644 --- a/src/utils/read_size_estimator.cpp +++ b/src/utils/read_size_estimator.cpp @@ -44,9 +44,8 @@ auto estimate_read_size(const AlignedRead& read) noexcept return sizeof(AlignedRead) + estimate_dynamic_size(read); } -auto get_valid_sample_regions(const std::vector& samples, - const InputRegionMap& input_regions, - ReadManager& read_manager) +auto get_covered_sample_regions(const std::vector& samples, const InputRegionMap& input_regions, + ReadManager& read_manager) { InputRegionMap result {}; result.reserve(input_regions.size()); @@ -54,9 +53,7 @@ auto get_valid_sample_regions(const std::vector& samples, InputRegionMap::mapped_type contig_regions {}; std::copy_if(std::cbegin(p.second), std::cend(p.second), std::inserter(contig_regions, std::begin(contig_regions)), - [&] (const auto& region) { - return read_manager.has_reads(samples, region); - }); + [&] (const auto& region) { return read_manager.has_reads(samples, region); }); if (!contig_regions.empty()) { result.emplace(p.first, std::move(contig_regions)); } @@ -72,7 +69,7 @@ boost::optional estimate_mean_read_size(const std::vector read_size_samples {}; diff --git a/src/utils/timing.hpp b/src/utils/timing.hpp index 1a1e2eadb..8a86dcc10 100644 --- a/src/utils/timing.hpp +++ b/src/utils/timing.hpp @@ -54,14 +54,16 @@ inline std::ostream& operator<<(std::ostream& os, const TimeInterval& interval) << ((100 * (duration_m.count() % 60)) / 60) << 'h'; } else { using H = std::chrono::hours::rep; - const auto days = std::div(duration_h.count(), H {24}); + constexpr H num_hours_in_day {24}; + const auto days = std::div(duration_h.count(), num_hours_in_day); if (days.quot < 7) { os << days.quot << '.' << std::setw(2) << std::setfill('0') - << days.rem << 'd'; + << ((100 * days.rem) / num_hours_in_day) << 'd'; } else { - const auto weeks = std::div(duration_h.count(), H {24 * 7}); + constexpr H num_hours_in_week {7 * num_hours_in_day}; + const auto weeks = std::div(duration_h.count(), num_hours_in_week); os << weeks.quot << '.' << std::setw(2) << std::setfill('0') - << weeks.rem << 'w'; + << ((100 * weeks.rem) / num_hours_in_week) << 'w'; } }