diff --git a/README.md b/README.md index 41bb4e7bc..b8d7cbb32 100644 --- a/README.md +++ b/README.md @@ -393,7 +393,7 @@ Daniel Cooke and Gerton Lunter ## Citing -Please cite the preprint: [A unified haplotype-based method for accurate and comprehensive variant calling](https://www.biorxiv.org/content/early/2018/10/29/456103). +Please cite the paper: [A unified haplotype-based method for accurate and comprehensive variant calling](https://www.nature.com/articles/s41587-021-00861-3). ## License diff --git a/scripts/install.py b/scripts/install.py index 4b1854ad1..9e78d3d4b 100755 --- a/scripts/install.py +++ b/scripts/install.py @@ -17,7 +17,7 @@ forests = ['germline', 'somatic'] latest_llvm = 'llvm' -latest_gcc = 'gcc@10' +latest_gcc = 'gcc' required_curl_version = 7, 41, 0 required_git_version = 2, 7, 0 @@ -262,10 +262,8 @@ def get_brewed_compiler_binaries(homebrew_dir): else: gcc_dir = cellar_dir / latest_gcc gcc_version = os.listdir(str(gcc_dir))[0] - gcc_bin_dir = gcc_dir /gcc_version / 'bin' - gcc_bin_name = latest_gcc.replace('@', '-') - gxx_bin_name = gcc_bin_name.replace('cc', '++') - return gcc_bin_dir / gcc_bin_name, gcc_bin_dir / gxx_bin_name + gcc_bin_dir = gcc_dir / gcc_version / 'bin' + return gcc_bin_dir / 'gcc', gcc_bin_dir / 'g++' def patch_homebrew_centos_gcc9(homebrew_dir): homebrew_bin_dir = homebrew_dir / 'bin' diff --git a/src/config/option_parser.cpp b/src/config/option_parser.cpp index b7eb59db0..d3734fab1 100644 --- a/src/config/option_parser.cpp +++ b/src/config/option_parser.cpp @@ -22,6 +22,8 @@ #include "exceptions/user_error.hpp" #include "config.hpp" +#include "core/csr/measures/measure_factory.hpp" + namespace po = boost::program_options; namespace fs = boost::filesystem; @@ -776,8 +778,10 @@ OptionMap parse_options(const int argc, const char** argv) .add(variant_discovery).add(haplotype_generation).add(general_variant_calling) .add(cancer).add(trio).add(polyclone).add(cell).add(call_filtering); + po::options_description conditional_help_options("Octopus help command line options"); + conditional_help_options.add(general).add(call_filtering); OptionMap vm_init; - po::store(run(po::command_line_parser(argc, argv).options(general).allow_unregistered()), vm_init); + po::store(run(po::command_line_parser(argc, argv).options(conditional_help_options).allow_unregistered()), vm_init); if (vm_init.count("help") == 1) { po::store(run(po::command_line_parser(argc, argv).options(general_variant_calling).allow_unregistered()), vm_init); @@ -821,6 +825,10 @@ OptionMap parse_options(const int argc, const char** argv) } else { std::cout << all << std::endl; } + if (vm_init.count("annotations") == 1) { + std::cout << "Available annotations:\n" << std::endl; + csr::print_all_measures_help(std::cout); + } return vm_init; } diff --git a/src/core/callers/cancer_caller.cpp b/src/core/callers/cancer_caller.cpp index cff824ced..5a337eaa1 100644 --- a/src/core/callers/cancer_caller.cpp +++ b/src/core/callers/cancer_caller.cpp @@ -1492,9 +1492,9 @@ void CancerCaller::log(const ModelPosteriors& model_posteriors) const namespace debug { -template +template void print_genotype_posteriors(S&& stream, - const std::unordered_map>, double>& genotype_posteriors, + std::vector> genotype_posteriors, const std::size_t n = std::numeric_limits::max()) { const auto m = std::min(n, genotype_posteriors.size()); @@ -1503,14 +1503,10 @@ void print_genotype_posteriors(S&& stream, } else { stream << "Printing top " << m << " genotype posteriors " << '\n'; } - using GenotypeReference = std::reference_wrapper>>; - std::vector> v {}; - v.reserve(genotype_posteriors.size()); - std::copy(std::cbegin(genotype_posteriors), std::cend(genotype_posteriors), std::back_inserter(v)); - const auto mth = std::next(std::begin(v), m); - std::partial_sort(std::begin(v), mth, std::end(v), + const auto mth = std::next(std::begin(genotype_posteriors), m); + std::partial_sort(std::begin(genotype_posteriors), mth, std::end(genotype_posteriors), [] (const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; }); - std::for_each(std::begin(v), mth, + std::for_each(std::begin(genotype_posteriors), mth, [&] (const auto& p) { print_variant_alleles(stream, p.first.get()); stream << " " << p.second << '\n'; @@ -1544,6 +1540,10 @@ void CancerCaller::log(const GenotypeVector& germline_genotypes, auto map_somatic = find_map_genotype(cancer_posteriors); auto map_cancer_genotype = map_somatic->first.get(); somatic_log << "MAP cancer genotype: "; + auto weighted_cancer_posteriors = zip_cref(cancer_genotypes, somatic_inferences.weighted_genotype_posteriors); + auto weighted_somatic_log = stream(*debug_log_); + weighted_somatic_log << "Weighted cancer genotypes... "; + debug::print_genotype_posteriors(weighted_somatic_log, weighted_cancer_posteriors, 10); debug::print_variant_alleles(somatic_log, map_cancer_genotype); somatic_log << ' ' << map_somatic->second; auto map_marginal_germline = find_map_genotype(germline_genotype_posteriors); @@ -1553,7 +1553,10 @@ void CancerCaller::log(const GenotypeVector& germline_genotypes, marginal_germline_log << ' ' << map_marginal_germline->second; } if (trace_log_) { - debug::print_genotype_posteriors(stream(*trace_log_), germline_genotype_posteriors); + auto weighted_cancer_posteriors = zip_cref(cancer_genotypes, somatic_inferences.weighted_genotype_posteriors); + auto weighted_somatic_log = stream(*trace_log_); + weighted_somatic_log << "Weighted cancer genotypes... "; + debug::print_genotype_posteriors(weighted_somatic_log, weighted_cancer_posteriors); } } diff --git a/src/core/csr/measures/measure_factory.cpp b/src/core/csr/measures/measure_factory.cpp index f1e449b89..e16871ec7 100644 --- a/src/core/csr/measures/measure_factory.cpp +++ b/src/core/csr/measures/measure_factory.cpp @@ -11,6 +11,8 @@ #include "exceptions/user_error.hpp" #include "utils/map_utils.hpp" #include "utils/string_utils.hpp" +#include "io/variant/vcf_header.hpp" +#include "io/variant/vcf_spec.hpp" namespace octopus { namespace csr { @@ -163,5 +165,36 @@ std::vector get_all_measure_names() return extract_sorted_keys(measure_makers); } +VcfHeader make_header(const std::vector& measures) +{ + VcfHeader::Builder builder {}; + for (const auto& measure : measures) { + measure.annotate(builder); + } + return builder.build_once(); +} + +void print_help(const std::vector& measures, std::ostream& os) +{ + const auto header = make_header(measures); + os << "Name\tKind\tNumber\tType\tDescription" << std::endl; + for (const auto& measure : measures) { + os << measure.name() << '\t'; + using namespace vcfspec::header; + const auto kind = header.has(format, measure.name()) ? format : info; + os << kind << '\t'; + os << get_id_field_value(header, kind, measure.name(), meta::struc::number) << '\t'; + os << get_id_field_value(header, kind, measure.name(), meta::struc::type) << '\t'; + os << get_id_field_value(header, kind, measure.name(), meta::struc::description) << '\t'; + os << std::endl; + } +} + +void print_all_measures_help(std::ostream& os) +{ + const auto measures = make_measures(get_all_measure_names()); + print_help(measures, os); +} + } // namespace csr } // namespace octopus diff --git a/src/core/csr/measures/measure_factory.hpp b/src/core/csr/measures/measure_factory.hpp index 41ecc3542..e367c4180 100644 --- a/src/core/csr/measures/measure_factory.hpp +++ b/src/core/csr/measures/measure_factory.hpp @@ -6,6 +6,7 @@ #include #include +#include #include "utils/string_utils.hpp" #include "measure.hpp" @@ -18,6 +19,9 @@ std::vector make_measures(std::vector names); std::vector get_all_measure_names(); +void print_help(const std::vector& measures, std::ostream& os); +void print_all_measures_help(std::ostream& os); + } // namespace csr } // namespace octopus diff --git a/src/core/models/genotype/subclone_model.cpp b/src/core/models/genotype/subclone_model.cpp index 597699b87..ed73fc15c 100644 --- a/src/core/models/genotype/subclone_model.cpp +++ b/src/core/models/genotype/subclone_model.cpp @@ -133,6 +133,10 @@ compute_genotype_likelihoods_with_fixed_mixture_model(const std::vector>& genotype, const ConstantMixtureGenotypeLikelihoodModel& model) +{ + return model.evaluate(genotype); +} auto evaluate(const CancerGenotype>& genotype, const ConstantMixtureGenotypeLikelihoodModel& model) { return model.evaluate(demote(genotype)); @@ -243,14 +247,41 @@ generate_seeds(const std::vector& samples, if (result.size() > max_seeds) return result; max_seeds -= result.size(); result.reserve(max_seeds); - result.push_back(genotype_log_priors); - IndividualModel germline_model {priors.genotype_prior_model}; - if (haplotypes) germline_model.prime(*haplotypes); + ConstantMixtureGenotypeLikelihoodModel basic_likelihood_model {haplotype_log_likelihoods}; + std::vector basic_sample_likelihoods {}; + basic_sample_likelihoods.reserve(samples.size()); for (const auto& sample : samples) { - haplotype_log_likelihoods.prime(sample); - auto latents = germline_model.evaluate(genotypes, haplotype_log_likelihoods); - result.push_back(std::move(latents.posteriors.genotype_log_probabilities)); + basic_likelihood_model.cache().prime(sample); + basic_sample_likelihoods.push_back(evaluate(genotypes, basic_likelihood_model)); } + auto basic_likelihoods = add_all_and_normalise(basic_sample_likelihoods); + auto basic_posteriors = add_and_normalise(genotype_log_priors, basic_likelihoods); + result.push_back(basic_posteriors); + --max_seeds; + if (max_seeds == 0) return result; + result.push_back(basic_likelihoods); + --max_seeds; + if (max_seeds == 0) return result; + result.push_back(genotype_log_priors); + maths::normalise_logs(result.back()); + --max_seeds; + if (max_seeds == 0) return result; + for (auto& likelihoods : basic_sample_likelihoods) { + result.push_back(std::move(likelihoods)); + --max_seeds; + if (max_seeds == 0) return result; + } + std::vector> ranked_basic_posteriors {}; + ranked_basic_posteriors.reserve(genotypes.size()); + for (std::size_t i {0}; i < genotypes.size(); ++i) { + ranked_basic_posteriors.push_back({basic_posteriors[i], i}); + } + const auto nth_ranked_basic_posteriors_itr = std::next(std::begin(ranked_basic_posteriors), max_seeds); + std::partial_sort(std::begin(ranked_basic_posteriors), + nth_ranked_basic_posteriors_itr, + std::end(ranked_basic_posteriors), std::greater<> {}); + std::transform(std::begin(ranked_basic_posteriors), nth_ranked_basic_posteriors_itr, + std::back_inserter(result), [&] (const auto& p) { return make_point_seed(genotypes.size(), p.second); }); return result; } diff --git a/src/io/variant/vcf_header.cpp b/src/io/variant/vcf_header.cpp index 753d7cc64..b41112665 100644 --- a/src/io/variant/vcf_header.cpp +++ b/src/io/variant/vcf_header.cpp @@ -51,7 +51,11 @@ bool VcfHeader::has(const Tag& t) const noexcept bool VcfHeader::has(const Tag& t, const StructuredKey& k) const noexcept { - return structured_fields_.count(t) > 0; // TODO: complete this + const auto er = structured_fields_.equal_range(t); + return std::find_if(er.first, er.second, + [&] (const auto& p) { + return p.second.at(vcfspec::header::meta::struc::id) == k; + }) != er.second; } const VcfHeader::Value& VcfHeader::at(const BasicKey& k) const @@ -59,8 +63,8 @@ const VcfHeader::Value& VcfHeader::at(const BasicKey& k) const return basic_fields_.at(k); } -const VcfHeader::Value& VcfHeader::find(const Tag& search_tag, const StructuredKey& search_key, - const StructuredKey& id_key, const Value& id_value) const +const VcfHeader::Value& VcfHeader::find(const Tag& search_tag, const StructuredKey& id_key, + const Value& id_value, const StructuredKey& search_key) const { const auto er = structured_fields_.equal_range(search_tag); return std::find_if(er.first, er.second, @@ -118,15 +122,19 @@ const VcfHeader::StructuredFieldMap& VcfHeader::structured_fields() const noexce // non-member methods -const VcfHeader::Value& get_id_field_value(const VcfHeader& header, const VcfHeader::Tag& tag, - const VcfHeader::Value& id_value, - const VcfHeader::StructuredKey& lookup_key) +const VcfHeader::Value& +get_id_field_value(const VcfHeader& header, + const VcfHeader::Tag& tag, + const VcfHeader::StructuredKey& id_value, + const VcfHeader::StructuredKey& lookup_key) { return header.find(tag, vcfspec::header::meta::struc::id, id_value, lookup_key); } -const VcfHeader::Value& get_id_field_type(const VcfHeader& header, const VcfHeader::Tag& tag, - const VcfHeader::Value& id_value) +const VcfHeader::Value& +get_id_field_type(const VcfHeader& header, + const VcfHeader::Tag& tag, + const VcfHeader::Value& id_value) { using namespace vcfspec::header::meta; return header.find(tag, struc::id, id_value, struc::type); diff --git a/src/io/variant/vcf_header.hpp b/src/io/variant/vcf_header.hpp index 081806c0e..bc9f84ae7 100644 --- a/src/io/variant/vcf_header.hpp +++ b/src/io/variant/vcf_header.hpp @@ -102,8 +102,8 @@ class VcfHeader : public Equitable bool has(const Tag& tag, const StructuredKey& k) const noexcept; const Value& at(const BasicKey& k) const; - const Value& find(const Tag& search_tag, const StructuredKey& search_key, - const StructuredKey& id_key, const Value& id_value) const; + const Value& find(const Tag& search_tag, const StructuredKey& id_key, + const Value& id_value, const StructuredKey& search_key) const; std::vector basic_keys() const; std::vector tags() const; @@ -162,12 +162,16 @@ VcfHeader::VcfHeader(T&& file_format, U&& samples, B&& basic_fields, S&& structu , structured_fields_ {std::forward(structured_fields)} {} -const std::string& get_id_field_value(const VcfHeader& header, const VcfHeader::Tag& t, - const VcfHeader::StructuredKey& id_value, - const VcfHeader::StructuredKey& lookup_key); +const VcfHeader::Value& +get_id_field_value(const VcfHeader& header, + const VcfHeader::Tag& t, + const VcfHeader::StructuredKey& id_value, + const VcfHeader::StructuredKey& lookup_key); -const std::string& get_id_field_type(const VcfHeader& header, const VcfHeader::Tag& t, - const VcfHeader::Value& id_value); +const VcfHeader::Value& +get_id_field_type(const VcfHeader& header, + const VcfHeader::Tag& t, + const VcfHeader::Value& id_value); VcfType get_typed_value(const VcfHeader& header, const VcfHeader::Tag& t, const VcfHeader::StructuredKey& key, const VcfHeader::Value& value);