diff --git a/src/core/callers/trio_caller.cpp b/src/core/callers/trio_caller.cpp index a1a38e65c..7700398d6 100644 --- a/src/core/callers/trio_caller.cpp +++ b/src/core/callers/trio_caller.cpp @@ -25,9 +25,13 @@ #include "core/models/genotype/uniform_population_prior_model.hpp" #include "core/models/genotype/coalescent_population_prior_model.hpp" #include "core/models/genotype/population_model.hpp" +#include "core/models/genotype/coalescent_genotype_prior_model.hpp" +#include "core/models/genotype/uniform_genotype_prior_model.hpp" +#include "core/models/genotype/individual_model.hpp" #include "utils/map_utils.hpp" #include "utils/mappable_algorithms.hpp" #include "utils/maths.hpp" +#include "utils/concat.hpp" #include "exceptions/unimplemented_feature_error.hpp" namespace octopus { @@ -50,8 +54,11 @@ TrioCaller::TrioCaller(Caller::Components&& components, : Caller {std::move(components), std::move(general_parameters)} , parameters_ {std::move(specific_parameters)} { - if (parameters_.maternal_ploidy == 0 || parameters_.paternal_ploidy == 0 || parameters_.child_ploidy == 0) { - throw std::logic_error {"TrioCaller: ploidy must be > 0"}; + if (parameters_.maternal_ploidy == 0 && parameters_.paternal_ploidy == 0 && parameters_.child_ploidy == 0) { + throw std::logic_error {"At least one sample must have positive ploidy"}; + } + if (parameters_.child_ploidy == 0 && parameters_.maternal_ploidy > 0 && parameters_.paternal_ploidy > 0) { + throw std::logic_error {"There must be at least one inherited haplotype if both parents have zygosity"}; } const auto max_ploidy = model::TrioModel::max_ploidy(); if (parameters_.maternal_ploidy > max_ploidy || parameters_.paternal_ploidy > max_ploidy || parameters_.child_ploidy > max_ploidy) { @@ -105,9 +112,14 @@ TrioCaller::Latents::Latents(const std::vector& haplotypes, : trio {trio} , maternal_genotypes {std::move(genotypes)} , model_latents {std::move(latents)} +, concatenated_genotypes_ {} +, padded_marginal_maternal_posteriors_ {} +, padded_marginal_paternal_posteriors_ {} +, padded_marginal_child_posteriors_ {} +, child_ploidy_ {maternal_genotypes.front().ploidy()} { - set_genotype_posteriors(trio); - set_haplotype_posteriors(haplotypes); + set_genotype_posteriors_shared_genotypes(trio); + set_haplotype_posteriors_shared_genotypes(haplotypes); } TrioCaller::Latents::Latents(const std::vector& haplotypes, @@ -120,8 +132,22 @@ TrioCaller::Latents::Latents(const std::vector& haplotypes, , maternal_genotypes {std::move(maternal_genotypes)} , paternal_genotypes {std::move(paternal_genotypes)} , model_latents {std::move(latents)} -{ - // TODO: in this case +, concatenated_genotypes_ {} +, padded_marginal_maternal_posteriors_ {} +, padded_marginal_paternal_posteriors_ {} +, padded_marginal_child_posteriors_ {} +, child_ploidy_ {child_ploidy} +{ + set_genotype_posteriors_unique_genotypes(trio); + set_haplotype_posteriors_unique_genotypes(haplotypes); + concatenated_genotypes_.clear(); + concatenated_genotypes_.shrink_to_fit(); + padded_marginal_maternal_posteriors_.clear(); + padded_marginal_maternal_posteriors_.shrink_to_fit(); + padded_marginal_paternal_posteriors_.clear(); + padded_marginal_paternal_posteriors_.shrink_to_fit(); + padded_marginal_child_posteriors_.clear(); + padded_marginal_child_posteriors_.shrink_to_fit(); } std::shared_ptr @@ -178,6 +204,15 @@ auto marginalise_child(const std::vector>& genotypes, } // namespace void TrioCaller::Latents::set_genotype_posteriors(const Trio& trio) +{ + if (paternal_genotypes) { + set_genotype_posteriors_unique_genotypes(trio); + } else { + set_genotype_posteriors_shared_genotypes(trio); + } +} + +void TrioCaller::Latents::set_genotype_posteriors_shared_genotypes(const Trio& trio) { auto& trio_posteriors = model_latents.posteriors.joint_genotype_probabilities; marginal_maternal_posteriors = marginalise_mother(maternal_genotypes, trio_posteriors); @@ -190,20 +225,62 @@ void TrioCaller::Latents::set_genotype_posteriors(const Trio& trio) marginal_genotype_posteriors = std::make_shared(std::move(genotype_posteriors)); } +void TrioCaller::Latents::set_genotype_posteriors_unique_genotypes(const Trio& trio) +{ + auto& trio_posteriors = model_latents.posteriors.joint_genotype_probabilities; + assert(paternal_genotypes); + marginal_paternal_posteriors = marginalise_father(*paternal_genotypes, trio_posteriors); + if (!maternal_genotypes.empty()) { + marginal_maternal_posteriors = marginalise_mother(maternal_genotypes, trio_posteriors); + } else { + maternal_genotypes.assign({Genotype {}}); + marginal_maternal_posteriors.assign({1.0}); + } + const bool child_shares_paternal_genotypes {child_ploidy_ == paternal_genotypes->front().ploidy()}; + if (child_shares_paternal_genotypes) { + marginal_child_posteriors = marginalise_child(*paternal_genotypes, trio_posteriors); + } else { + if (maternal_genotypes.size() > 1) { + marginal_child_posteriors = marginalise_child(maternal_genotypes, trio_posteriors); + } else { + marginal_child_posteriors.assign({1.0}); + } + } + concatenated_genotypes_ = concat(maternal_genotypes, *paternal_genotypes); + GenotypeProbabilityMap genotype_posteriors {std::cbegin(concatenated_genotypes_), std::cend(concatenated_genotypes_)}; + const auto num_unique_genotypes = concatenated_genotypes_.size(); + padded_marginal_maternal_posteriors_.resize(num_unique_genotypes); + std::copy(std::cbegin(marginal_maternal_posteriors), std::cend(marginal_maternal_posteriors), + std::begin(padded_marginal_maternal_posteriors_)); + padded_marginal_paternal_posteriors_.resize(num_unique_genotypes); + std::copy(std::crbegin(marginal_paternal_posteriors), std::crend(marginal_paternal_posteriors), + std::rbegin(padded_marginal_paternal_posteriors_)); + padded_marginal_child_posteriors_.resize(num_unique_genotypes); + if (child_shares_paternal_genotypes) { + std::copy(std::crbegin(marginal_child_posteriors), std::crend(marginal_child_posteriors), + std::rbegin(padded_marginal_child_posteriors_)); + } else { + std::copy(std::cbegin(marginal_child_posteriors), std::cend(marginal_child_posteriors), + std::begin(padded_marginal_child_posteriors_)); + } + insert_sample(trio.mother(), padded_marginal_maternal_posteriors_, genotype_posteriors); + insert_sample(trio.father(), padded_marginal_paternal_posteriors_, genotype_posteriors); + insert_sample(trio.child(), padded_marginal_child_posteriors_, genotype_posteriors); + marginal_genotype_posteriors = std::make_shared(std::move(genotype_posteriors)); +} + namespace { using JointProbability = TrioModel::Latents::JointProbability; using TrioProbabilityVector = std::vector; using InverseGenotypeTable = std::vector>; -auto make_inverse_genotype_table(const std::vector& haplotypes, - const std::vector>& genotypes) +auto make_inverse_genotype_table(const std::vector& haplotypes, const std::vector>& genotypes) { assert(!haplotypes.empty() && !genotypes.empty()); using HaplotypeReference = std::reference_wrapper; std::unordered_map> result_map {haplotypes.size()}; - const auto cardinality = element_cardinality_in_genotypes(static_cast(haplotypes.size()), - genotypes.front().ploidy()); + const auto cardinality = element_cardinality_in_genotypes(static_cast(haplotypes.size()), genotypes.front().ploidy()); for (const auto& haplotype : haplotypes) { auto itr = result_map.emplace(std::piecewise_construct, std::forward_as_tuple(std::cref(haplotype)), @@ -262,21 +339,66 @@ auto calculate_haplotype_posteriors(const std::vector& haplotypes, } // namespace void TrioCaller::Latents::set_haplotype_posteriors(const std::vector& haplotypes) +{ + if (paternal_genotypes) { + set_haplotype_posteriors_unique_genotypes(haplotypes); + } else { + set_haplotype_posteriors_shared_genotypes(haplotypes); + } +} + +void TrioCaller::Latents::set_haplotype_posteriors_shared_genotypes(const std::vector& haplotypes) { auto inverse_genotypes = make_inverse_genotype_table(haplotypes, maternal_genotypes); - const GenotypeMarginalPosteriorMatrix genotype_posteriors { - marginal_maternal_posteriors, marginal_paternal_posteriors, marginal_child_posteriors - }; + const GenotypeMarginalPosteriorMatrix genotype_posteriors {marginal_maternal_posteriors, + marginal_paternal_posteriors, + marginal_child_posteriors}; auto haplotype_posteriors = calculate_haplotype_posteriors(haplotypes, maternal_genotypes, genotype_posteriors, inverse_genotypes); marginal_haplotype_posteriors = std::make_shared(haplotype_posteriors); } +void TrioCaller::Latents::set_haplotype_posteriors_unique_genotypes(const std::vector& haplotypes) +{ + auto inverse_genotypes = make_inverse_genotype_table(haplotypes, concatenated_genotypes_); + const GenotypeMarginalPosteriorMatrix genotype_posteriors {padded_marginal_maternal_posteriors_, + padded_marginal_paternal_posteriors_, + padded_marginal_child_posteriors_}; + auto haplotype_posteriors = calculate_haplotype_posteriors(haplotypes, concatenated_genotypes_, genotype_posteriors, inverse_genotypes); + marginal_haplotype_posteriors = std::make_shared(haplotype_posteriors); +} + + + // TrioCaller std::unique_ptr TrioCaller::infer_latents(const std::vector& haplotypes, const HaplotypeLikelihoodArray& haplotype_likelihoods) const { + if (parameters_.child_ploidy == 0) { + assert(parameters_.maternal_ploidy == 0 || parameters_.paternal_ploidy == 0); + std::vector> parent_genotypes {}, empty_genotypes {}; + const auto prior_model = make_single_sample_prior_model(haplotypes); + const model::IndividualModel sample_model {*prior_model}; + if (parameters_.maternal_ploidy > 0) { + parent_genotypes = generate_all_genotypes(haplotypes, parameters_.maternal_ploidy); + haplotype_likelihoods.prime(parameters_.trio.mother()); + } else { + parent_genotypes = generate_all_genotypes(haplotypes, parameters_.paternal_ploidy); + haplotype_likelihoods.prime(parameters_.trio.father()); + } + auto sample_latents = sample_model.evaluate(parent_genotypes, haplotype_likelihoods); + model::TrioModel::InferredLatents trio_latents {}; + trio_latents.log_evidence = sample_latents.log_evidence; + trio_latents.posteriors.joint_genotype_probabilities.reserve(parent_genotypes.size()); + std::transform(std::cbegin(parent_genotypes), std::cend(parent_genotypes), std::cbegin(sample_latents.posteriors.genotype_probabilities), + std::back_inserter(trio_latents.posteriors.joint_genotype_probabilities), + [] (const auto& genotype, auto posterior) -> model::TrioModel::Latents::JointProbability { + return {genotype, genotype, genotype, posterior}; }); + if (parameters_.maternal_ploidy == 0) std::swap(parent_genotypes, empty_genotypes); + return std::make_unique(haplotypes, std::move(parent_genotypes), std::move(empty_genotypes), + parameters_.child_ploidy, std::move(trio_latents), parameters_.trio); + } auto germline_prior_model = make_prior_model(haplotypes); DeNovoModel denovo_model {parameters_.denovo_model_params, haplotypes.size(), DeNovoModel::CachingStrategy::address}; const model::TrioModel model { @@ -297,13 +419,13 @@ TrioCaller::infer_latents(const std::vector& haplotypes, if (parameters_.maternal_ploidy == parameters_.child_ploidy) { auto latents = model.evaluate(maternal_genotypes, paternal_genotypes, maternal_genotypes, haplotype_likelihoods); - return std::make_unique(haplotypes, std::move(maternal_genotypes), - std::move(latents), parameters_.trio); + return std::make_unique(haplotypes, std::move(maternal_genotypes), std::move(maternal_genotypes), + parameters_.child_ploidy, std::move(latents), parameters_.trio); } else { auto latents = model.evaluate(maternal_genotypes, paternal_genotypes, paternal_genotypes, haplotype_likelihoods); - return std::make_unique(haplotypes, std::move(maternal_genotypes), - std::move(latents), parameters_.trio); + return std::make_unique(haplotypes, std::move(maternal_genotypes), std::move(paternal_genotypes), + parameters_.child_ploidy, std::move(latents), parameters_.trio); } } } @@ -911,8 +1033,11 @@ TrioCaller::call_variants(const std::vector& candidates, const Latents& auto denovos = call_denovos(denovo_posteriors, allele_posteriors, parameters_.min_denovo_posterior); const auto germline_alleles = get_germline_alleles(called_alleles, denovos); auto germline_variants = call_germline_variants(germline_alleles, candidates, parameters_.min_variant_posterior); - const auto called_trio = call_trio(trio_posteriors, germline_variants, denovos); + auto called_trio = call_trio(trio_posteriors, germline_variants, denovos); remove_ungenotyped_allele(germline_variants, denovos, called_trio); + if (parameters_.maternal_ploidy == 0) called_trio.mother = Genotype {}; + if (parameters_.paternal_ploidy == 0) called_trio.father = Genotype {}; + if (parameters_.child_ploidy == 0) called_trio.child = Genotype {}; auto denovo_genotypes = call_genotypes(parameters_.trio, called_trio, *latents.genotype_posteriors(), extract_regions(denovos)); auto germline_genotypes = call_genotypes(parameters_.trio, called_trio, *latents.genotype_posteriors(), extract_regions(germline_variants)); return make_calls(std::move(germline_variants), std::move(germline_genotypes), @@ -937,16 +1062,25 @@ TrioCaller::call_reference(const std::vector& alleles, const Latents& la std::unique_ptr TrioCaller::make_prior_model(const std::vector& haplotypes) const { if (parameters_.germline_prior_model_params) { - return std::make_unique(CoalescentModel { - Haplotype {mapped_region(haplotypes.front()), reference_}, - *parameters_.germline_prior_model_params, - haplotypes.size(), CoalescentModel::CachingStrategy::address - }); + return std::make_unique(CoalescentModel {Haplotype {mapped_region(haplotypes.front()), reference_}, + *parameters_.germline_prior_model_params, + haplotypes.size(), CoalescentModel::CachingStrategy::address}); } else { return std::make_unique(); } } +std::unique_ptr TrioCaller::make_single_sample_prior_model(const std::vector& haplotypes) const +{ + if (parameters_.germline_prior_model_params) { + return std::make_unique(CoalescentModel {Haplotype {mapped_region(haplotypes.front()), reference_}, + *parameters_.germline_prior_model_params, + haplotypes.size(), CoalescentModel::CachingStrategy::address}); + } else { + return std::make_unique(); + } +} + namespace debug { template diff --git a/src/core/callers/trio_caller.hpp b/src/core/callers/trio_caller.hpp index db23c6e57..d832785d5 100644 --- a/src/core/callers/trio_caller.hpp +++ b/src/core/callers/trio_caller.hpp @@ -13,6 +13,7 @@ #include "core/types/genotype.hpp" #include "core/models/mutation/coalescent_model.hpp" #include "core/models/genotype/population_prior_model.hpp" +#include "core/models/genotype/genotype_prior_model.hpp" #include "core/models/mutation/denovo_model.hpp" #include "core/models/genotype/trio_model.hpp" @@ -91,6 +92,7 @@ class TrioCaller : public Caller const ReadPileupMap& pileups) const; std::unique_ptr make_prior_model(const std::vector& haplotypes) const; + std::unique_ptr make_single_sample_prior_model(const std::vector& haplotypes) const; }; class TrioCaller::Latents : public Caller::Latents @@ -121,9 +123,16 @@ class TrioCaller::Latents : public Caller::Latents std::vector marginal_maternal_posteriors, marginal_paternal_posteriors, marginal_child_posteriors; mutable std::shared_ptr marginal_genotype_posteriors; std::shared_ptr marginal_haplotype_posteriors; + std::vector> concatenated_genotypes_; + std::vector padded_marginal_maternal_posteriors_, padded_marginal_paternal_posteriors_, padded_marginal_child_posteriors_; + unsigned child_ploidy_; void set_genotype_posteriors(const Trio& trio); + void set_genotype_posteriors_shared_genotypes(const Trio& trio); + void set_genotype_posteriors_unique_genotypes(const Trio& trio); void set_haplotype_posteriors(const std::vector& haplotypes); + void set_haplotype_posteriors_shared_genotypes(const std::vector& haplotypes); + void set_haplotype_posteriors_unique_genotypes(const std::vector& haplotypes); }; } // namespace octopus diff --git a/src/core/csr/measures/misaligned_read_count.cpp b/src/core/csr/measures/misaligned_read_count.cpp index 9346a7b47..b48d41bac 100644 --- a/src/core/csr/measures/misaligned_read_count.cpp +++ b/src/core/csr/measures/misaligned_read_count.cpp @@ -68,10 +68,12 @@ Measure::ResultType MisalignedReadCount::do_evaluate(const VcfRecord& call, cons result.reserve(samples.size()); for (const auto& sample : samples) { int sample_result {0}; - for (const auto& p : assignments.support.at(sample)) { - auto realigned_reads = copy_overlapped(p.second, call); - safe_realign(realigned_reads, p.first); - sample_result += count_likely_misaligned(realigned_reads); + if (assignments.support.count(sample) == 1) { + for (const auto& p : assignments.support.at(sample)) { + auto realigned_reads = copy_overlapped(p.second, call); + safe_realign(realigned_reads, p.first); + sample_result += count_likely_misaligned(realigned_reads); + } } result.push_back(sample_result); } diff --git a/src/core/models/genotype/trio_model.cpp b/src/core/models/genotype/trio_model.cpp index 748a9351e..ab7a51cb2 100644 --- a/src/core/models/genotype/trio_model.cpp +++ b/src/core/models/genotype/trio_model.cpp @@ -603,6 +603,22 @@ template <> struct ProbabilityOfChildGivenParents<1, 0, 1> const DeNovoModel& mutation_model; }; +template <> struct ProbabilityOfChildGivenParents<1, 1, 1> +{ + ProbabilityOfChildGivenParents(const DeNovoModel& mutation_model) : mutation_model {mutation_model} {} + + template + double operator()(const G& child, const G& mother, const G& father) + { + static const double ln2 {std::log(2)}; + const auto p1 = probability_of_child_given_haploid_parent(child[0], mother, mutation_model); + const auto p2 = probability_of_child_given_haploid_parent(child[0], father, mutation_model); + return maths::log_sum_exp(p1, p2) - ln2; + } + + const DeNovoModel& mutation_model; +}; + using JointProbability = TrioModel::Latents::JointProbability; template @@ -656,6 +672,9 @@ auto join(const ReducedVectorMap& parents, if (maternal_ploidy == 0) { return join(parents, child, ProbabilityOfChildGivenParents<1, 0, 1> {mutation_model}); } + if (maternal_ploidy == 1) { + return join(parents, child, ProbabilityOfChildGivenParents<1, 1, 1> {mutation_model}); + } if (maternal_ploidy == 2) { return join(parents, child, ProbabilityOfChildGivenParents<1, 2, 1> {mutation_model}); } @@ -668,8 +687,6 @@ auto join(const ReducedVectorMap& parents, if (paternal_ploidy == 2) { return join(parents, child, ProbabilityOfChildGivenParents<2, 2, 2> {mutation_model}); } - } else { - } } else if (child_ploidy == 3 && maternal_ploidy == 3 && paternal_ploidy == 3) { return join(parents, child, ProbabilityOfChildGivenParents<3, 3, 3> {mutation_model}); @@ -838,7 +855,7 @@ TrioModel::evaluate_allosome(const GenotypeVector& parent_genotypes, const GenotypeVector& child_genotypes, const HaplotypeLikelihoodArray& haplotype_likelihoods) const { - assert(!parent_genotypes.empty() && child_genotypes.empty()); + assert(!parent_genotypes.empty() && !child_genotypes.empty()); const GermlineLikelihoodModel likelihood_model {haplotype_likelihoods}; assert(haplotype_likelihoods.is_primed()); auto parent_likelihoods = compute_likelihoods(parent_genotypes, likelihood_model); diff --git a/src/utils/genotype_reader.cpp b/src/utils/genotype_reader.cpp index ff44781b7..06640a672 100644 --- a/src/utils/genotype_reader.cpp +++ b/src/utils/genotype_reader.cpp @@ -138,10 +138,11 @@ make_allele(const VcfRecord& call, VcfRecord::NucleotideSequence allele_sequence auto extract_genotype(const VcfRecord& call, const SampleName& sample) { auto genotype = get_genotype(call, sample); - boost::optional max_ref_pad {}; - std::vector unknown_pad_indices {}; const auto ploidy = genotype.size(); std::vector> result(ploidy, boost::none); + if (ploidy == 0) return result; + boost::optional max_ref_pad {}; + std::vector unknown_pad_indices {}; for (std::size_t i {0}; i < ploidy; ++i) { auto& allele = genotype[i]; if (is_ref_pad_size_known(allele, call)) { @@ -323,46 +324,39 @@ GenotypeMap extract_genotypes(const std::vector& calls, { if (calls.empty()) return {}; GenotypeMap result {samples.size()}; - for (const auto& sample : samples) { const auto wrapped_calls = segment_overlapped_copy(wrap_calls(calls, sample)); - using InitList = std::initializer_list>; if (wrapped_calls.size() == 1) { if (!call_region) { call_region = encompassing_region(wrapped_calls.front()); } - result.emplace(std::piecewise_construct, - std::forward_as_tuple(sample), - std::forward_as_tuple(InitList { - extract_genotype(wrapped_calls.front(), *call_region, sample, reference) - })); + auto genotype = extract_genotype(wrapped_calls.front(), *call_region, sample, reference); + if (genotype.ploidy() > 0) result[sample] = {std::move(genotype)}; } else { // wrapped_calls.size() > 1 - auto it = std::cbegin(wrapped_calls); + auto call_itr = std::cbegin(wrapped_calls); GenomicRegion region; if (call_region) { - region = left_overhang_region(*call_region, std::next(it)->front()); + region = left_overhang_region(*call_region, std::next(call_itr)->front()); } else { - region = left_overhang_region(it->front(), std::next(it)->front()); + region = left_overhang_region(call_itr->front(), std::next(call_itr)->front()); } - result.emplace(std::piecewise_construct, - std::forward_as_tuple(sample), - std::forward_as_tuple(InitList { - extract_genotype(*it, region, sample, reference) - })); - ++it; - for (auto penultimate = std::prev(std::cend(wrapped_calls)); it != penultimate; ++it) { - region = *intervening_region(std::prev(it)->back(), std::next(it)->front()); - result.at(sample).insert(extract_genotype(*it, region, sample, reference)); + auto genotype = extract_genotype(*call_itr, region, sample, reference); + if (genotype.ploidy() > 0) result[sample] = {std::move(genotype)}; + ++call_itr; + for (auto penultimate = std::prev(std::cend(wrapped_calls)); call_itr != penultimate; ++call_itr) { + region = *intervening_region(std::prev(call_itr)->back(), std::next(call_itr)->front()); + genotype = extract_genotype(*call_itr, region, sample, reference); + if (genotype.ploidy() > 0) result.at(sample).insert(std::move(genotype)); } if (call_region) { - region = right_overhang_region(*call_region, std::prev(it)->back()); + region = right_overhang_region(*call_region, std::prev(call_itr)->back()); } else { - region = right_overhang_region(it->back(), std::prev(it)->back()); + region = right_overhang_region(call_itr->back(), std::prev(call_itr)->back()); } - result.at(sample).insert(extract_genotype(*it, region, sample, reference)); + genotype = extract_genotype(*call_itr, region, sample, reference); + if (genotype.ploidy() > 0) result.at(sample).insert(std::move(genotype)); } } - return result; }