diff --git a/deps/gbwtgraph b/deps/gbwtgraph index e16221f047..9fa7f6fa82 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit e16221f0479cdf6a7c709ce353687a059503bb67 +Subproject commit 9fa7f6fa82f329399a99c1e84573bdae96940d1b diff --git a/src/alignment.cpp b/src/alignment.cpp index 274476e16c..49a39e0787 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -2847,6 +2847,30 @@ void alignment_set_distance_to_correct(Alignment& aln, const map > > alignment_refpos_to_path_offsets(const void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map* translation = nullptr); void alignment_set_distance_to_correct(Alignment& aln, const map>>& base_offsets, const unordered_map* translation = nullptr); +/** + * Stop the program and print a useful error message if the alignment has + * quality values, but not the right number of them for the number of sequence + * bases. + */ +void check_quality_length(const Alignment& aln); + /** * Represents a report on whether an alignment makes sense in the context of a graph. */ diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 0f15b8139d..a0ed588db4 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -379,8 +379,13 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector& name } else { string blank_gt = "."; if (gbwt_sample_to_phase_range.count(sample_name)) { - auto& phase_range = gbwt_sample_to_phase_range.at(sample_name); - for (int phase = phase_range.first + 1; phase <= phase_range.second; ++phase) { + // note: this is the same logic used when filling in actual genotypes + int min_phase, max_phase; + std::tie(min_phase, max_phase) = gbwt_sample_to_phase_range.at(sample_name); + // shift left by 1 unless min phase is 0 + int sample_ploidy = min_phase == 0 ? max_phase + 1 : max_phase; + assert(sample_ploidy > 0); + for (int phase = 1; phase < sample_ploidy; ++phase) { blank_gt += "|."; } } @@ -1132,6 +1137,10 @@ string Deconstructor::get_vcf_header() { if (haplotype == PathMetadata::NO_HAPLOTYPE) { haplotype = 0; } + if (haplotype > 10) { + cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << haplotype + << ", parsed from path, " << path_name << ": This will leed to giant GT entries."; + } sample_to_haps[sample_name].insert((int)haplotype); sample_names.insert(sample_name); } @@ -1159,6 +1168,10 @@ string Deconstructor::get_vcf_header() { // Default to 0. phase = 0; } + if (phase > 10) { + cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << phase + << ", parsed from GBWT thread, " << path_name << ": This will leed to giant GT entries."; + } sample_to_haps[sample_name].insert((int)phase); sample_names.insert(sample_name); } diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index 75399a86e6..38d79601e8 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -415,6 +415,127 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle return build_gbwt({}, "GBWT", &graph); } +//------------------------------------------------------------------------------ + +// GAF / GAM to GBWT. + +// Returns (count, symbol) for present symbols. +// Assumes that the input is non-empty and all vectors have the same length. +std::vector> present_symbols(const std::vector>& counts_by_job) { + std::vector> result; + for (size_t symbol = 0; symbol < counts_by_job.front().size(); symbol++) { + size_t count = 0; + for (size_t j = 0; j < counts_by_job.size(); j++) { + count += counts_by_job[j][symbol]; + } + if (count > 0) { + result.push_back(std::make_pair(count, symbol)); + } + } + return result; +} + +// Inputs: (count, symbol) +// Outputs: (code length, symbol) sorted by code length +std::vector> canonical_huffman(const std::vector>& symbols) { + if (symbols.empty()) { + return std::vector>(); + } + + // Internal nodes as pairs of children. + std::vector> nodes; + // (count, id), with id referring first to symbols and then to nodes. + std::vector> queue; queue.reserve(symbols.size()); + for (size_t i = 0; i < symbols.size(); i++) { + queue.push_back(std::make_pair(symbols[i].first, i)); + } + std::make_heap(queue.begin(), queue.end(), std::greater>()); + + // Build the Huffman tree. + while (queue.size() > 1) { + std::pop_heap(queue.begin(), queue.end(), std::greater>()); + auto left = queue.back(); queue.pop_back(); + std::pop_heap(queue.begin(), queue.end(), std::greater>()); + auto right = queue.back(); queue.pop_back(); + size_t count = left.first + right.first; + size_t id = symbols.size() + nodes.size(); + nodes.push_back(std::make_pair(left.second, right.second)); + queue.push_back(std::make_pair(count, id)); + std::push_heap(queue.begin(), queue.end(), std::greater>()); + } + + // Determine the code lengths. + std::vector> result(symbols.size()); + std::function dfs = [&](size_t node, size_t depth) { + if (node < symbols.size()) { + result[node] = std::make_pair(depth, symbols[node].second); + } else { + dfs(nodes[node - symbols.size()].first, depth + 1); + dfs(nodes[node - symbols.size()].second, depth + 1); + } + }; + dfs(queue.front().second, 0); + + std::sort(result.begin(), result.end()); + return result; +} + +std::string longest_common_prefix(const std::vector>& read_names) { + bool has_prefix = false; + std::string prefix; + for (auto& names : read_names) { + for (auto& name : names) { + if (!has_prefix) { + prefix = name; + has_prefix = true; + } else { + size_t i = 0; + while (i < prefix.length() && i < name.length() && prefix[i] == name[i]) { + i++; + } + prefix.resize(i); + } + } + } + return prefix; +} + +// This consumes the read names. +void create_alignment_metadata( + std::vector>& read_names, + const std::string& prefix, + gbwt::Metadata& metadata) { + + // We can use 32-bit values, as GBWT metadata uses them as well. + string_hash_map> read_info; // name -> (sample id, fragment count) + for (auto& names : read_names) { + for (const std::string& name : names) { + std::string sample_name = name.substr(prefix.length()); + std::uint32_t sample_id = 0, fragment_count = 0; + auto iter = read_info.find(sample_name); + if (iter == read_info.end()) { + sample_id = read_info.size(); + read_info[sample_name] = std::make_pair(sample_id, fragment_count); + } else { + sample_id = iter->second.first; + fragment_count = iter->second.second; + iter->second.second++; + } + metadata.addPath(sample_id, 0, 0, fragment_count); + } + names = std::vector(); + } + std::vector sample_names(read_info.size()); + for (auto& p : read_info) { + sample_names[p.second.first] = p.first; + } + read_info = string_hash_map>(); + + metadata.setSamples(sample_names); + metadata.setContigs({ "unknown" }); + metadata.setHaplotypes(sample_names.size()); +} + std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& graph, const std::vector& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const { @@ -442,6 +563,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap // GBWT construction. std::vector builder_mutexes(jobs.size()); std::vector> builders(jobs.size()); + std::vector> quality_values(jobs.size(), std::vector(256, 0)); // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang). // TODO: Maybe use concatenated 0-terminated names? std::vector> read_names(jobs.size()); @@ -468,6 +590,10 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap job_id = 0; } } + for (auto c : aln.quality()) { + unsigned char value = io::quality_short_to_char(c); + quality_values[job_id][value]++; + } { // Insert the path into the appropriate builder and record the read name. std::lock_guard lock(builder_mutexes[job_id]); @@ -505,6 +631,21 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap std::unique_ptr result(new gbwt::GBWT(partial_indexes)); partial_indexes.clear(); + // Determine the quality score alphabet and canonical Huffman code lengths. + // The items are first (count, character value) and then (code length, character value). + std::vector> present = present_symbols(quality_values); + present = canonical_huffman(present); + std::string alphabet, code_lengths; + for (auto symbol : present) { + alphabet.push_back(symbol.second); + if (code_lengths.length() > 0) { + code_lengths.push_back(','); + } + code_lengths.append(std::to_string(symbol.first)); + } + result->tags.set("quality_values", alphabet); + result->tags.set("quality_lengths", code_lengths); + // Create the metadata. if (this->show_progress) { #pragma omp critical @@ -513,34 +654,9 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap } } result->addMetadata(); - result->metadata.setContigs({ "unknown" }); - { - // We can use 32-bit values, as GBWT metadata uses them as well. - string_hash_map> read_info; // name -> (sample id, fragment count) - for (auto& names : read_names) { - for (const std::string& name : names) { - std::uint32_t sample_id = 0, fragment_count = 0; - auto iter = read_info.find(name); - if (iter == read_info.end()) { - sample_id = read_info.size(); - read_info[name] = std::make_pair(sample_id, fragment_count); - } else { - sample_id = iter->second.first; - fragment_count = iter->second.second; - iter->second.second++; - } - result->metadata.addPath(sample_id, 0, 0, fragment_count); - } - names = std::vector(); - } - std::vector sample_names(read_info.size()); - for (auto& p : read_info) { - sample_names[p.second.first] = p.first; - } - read_info = string_hash_map>(); - result->metadata.setSamples(sample_names); - result->metadata.setHaplotypes(sample_names.size()); - } + std::string prefix = longest_common_prefix(read_names); + result->tags.set("sample_prefix", prefix); + create_alignment_metadata(read_names, prefix, result->metadata); if (this->show_progress) { #pragma omp critical { @@ -555,4 +671,6 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap return result; } +//------------------------------------------------------------------------------ + } diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp index ca212a9426..ab3a69c587 100644 --- a/src/haplotype_indexer.hpp +++ b/src/haplotype_indexer.hpp @@ -140,6 +140,15 @@ class HaplotypeIndexer : public Progressive { * Runs approximately the given number of jobs in parallel. The exact * number depends on the sizes of weakly connected components in the graph. * Each job uses at most 2 threads. + * + * In order to save space, the longest common prefix of alignment names + * is stored as GBWT tag sample_prefix. Only the diverging suffixes are + * stored as sample names. + * + * Additionally, the list of values used in quality strings is stored as GBWT + * tag quality_values. Tag quality_codes stores a comma-separated list of + * canonical Huffman code lengths for the quality values. The values are + * sorted by code length. */ std::unique_ptr build_gbwt(const HandleGraph& graph, const std::vector& aln_filenames, const std::string& aln_format, diff --git a/src/readfilter.hpp b/src/readfilter.hpp index 3d38bba2f4..31d2caa1aa 100644 --- a/src/readfilter.hpp +++ b/src/readfilter.hpp @@ -1534,6 +1534,8 @@ inline void ReadFilter::emit_tsv(Alignment& read, std::ostream& out) out << softclip_start(read); } else if (field == "softclip_end") { out << softclip_end(read); + } else if (field == "identity") { + out << read.identity(); } else if (field == "mapping_quality") { out << get_mapq(read); } else if (field == "sequence") { diff --git a/src/recombinator.cpp b/src/recombinator.cpp index a4ababbcb5..65d540b2d0 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -66,7 +66,7 @@ hash_map Haplotypes::kmer_counts(const for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { const Subchain& subchain = chain.subchains[subchain_id]; for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { - result[subchain.kmers[kmer_id].first] = 0; + result[subchain.kmers[kmer_id]] = 0; } } } @@ -146,51 +146,145 @@ std::string Haplotypes::Subchain::to_string() const { return result; } +size_t Haplotypes::Subchain::distance(const gbwtgraph::GBZ& gbz, size_t i) const { + if (this->type != normal || i >= this->sequences.size()) { + return 0; + } + + size_t result = 1; + gbwt::edge_type curr(this->start, this->sequences[i].second); + while (true) { + curr = gbz.index.LF(curr); + if (curr.first == gbwt::ENDMARKER || curr.first == this->end) { + break; + } + result += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(curr.first)); + } + + return result; +} + +// TODO: What is the right formula? +double Haplotypes::Subchain::badness(const gbwtgraph::GBZ& gbz) const { + double result = 0.0; + + // Factor 1: Subchain length, ideally over a reference path. + if (this->type == normal) { + size_t expected_length = HaplotypePartitioner::SUBCHAIN_LENGTH; + size_t selected = 0; + for (size_t i = 0; i < this->sequences.size(); i++) { + gbwt::size_type path_id = gbwt::Path::id(this->sequences[i].first); + auto found = gbz.graph.id_to_path.find(path_id); + if (found != gbz.graph.id_to_path.end()) { + selected = i; break; + } + } + size_t length = this->distance(gbz, selected); + if (length < expected_length) { + result += std::log(static_cast(expected_length) / static_cast(length)); + } + } + + // Factor 2: Number of haplotypes relative to the expected number. + size_t expected_haplotypes = gbz.index.metadata.haplotypes(); + size_t haplotypes = this->sequences.size(); + if (haplotypes < expected_haplotypes) { + result += std::log(static_cast(expected_haplotypes) / static_cast(haplotypes)); + } + + // Factor 3: Information content of the kmers. + // Disabled for the moment. + /* double expected_entropy = 4.0 * std::log(static_cast(haplotypes)); + double entropy = 0.0; + for (size_t i = 0; i < this->kmer_counts.size(); i++) { + double p = static_cast(this->kmer_counts[i]) / static_cast(haplotypes); + entropy -= p * std::log(p); + } + if (entropy < expected_entropy) { + result += expected_entropy - entropy; + } */ + + return result; +} + void Haplotypes::Subchain::simple_sds_serialize(std::ostream& out) const { sdsl::simple_sds::serialize_value(this->type, out); sdsl::simple_sds::serialize_value(this->start, out); sdsl::simple_sds::serialize_value(this->end, out); sdsl::simple_sds::serialize_vector(this->kmers, out); + this->kmer_counts.simple_sds_serialize(out); sdsl::simple_sds::serialize_vector(this->sequences, out); this->kmers_present.simple_sds_serialize(out); } -void Haplotypes::Subchain::simple_sds_load(std::istream& in) { +void load_subchain_header(Haplotypes::Subchain& subchain, std::istream& in) { std::uint64_t temp = sdsl::simple_sds::load_value(in); switch (temp) { - case normal: // Fall through. - case prefix: // Fall through. - case suffix: // Fall through. - case full_haplotype: - this->type = static_cast(temp); + case Haplotypes::Subchain::normal: // Fall through. + case Haplotypes::Subchain::prefix: // Fall through. + case Haplotypes::Subchain::suffix: // Fall through. + case Haplotypes::Subchain::full_haplotype: + subchain.type = static_cast(temp); break; default: throw sdsl::simple_sds::InvalidData("Invalid subchain type: " + std::to_string(temp)); } - this->start = sdsl::simple_sds::load_value(in); - this->end = sdsl::simple_sds::load_value(in); - bool should_have_start = (this->type == normal || this->type == suffix); - bool should_have_end = (this->type == normal || this->type == prefix); - if ((this->start != gbwt::ENDMARKER) != should_have_start) { - throw sdsl::simple_sds::InvalidData("Subchain start node " + std::to_string(this->start) + " does not match type " + std::to_string(temp)); + subchain.start = sdsl::simple_sds::load_value(in); + subchain.end = sdsl::simple_sds::load_value(in); + bool should_have_start = (subchain.type == Haplotypes::Subchain::normal || subchain.type == Haplotypes::Subchain::suffix); + bool should_have_end = (subchain.type == Haplotypes::Subchain::normal || subchain.type == Haplotypes::Subchain::prefix); + if ((subchain.start != gbwt::ENDMARKER) != should_have_start) { + throw sdsl::simple_sds::InvalidData("Subchain start node " + std::to_string(subchain.start) + " does not match type " + std::to_string(temp)); } - if ((this->end != gbwt::ENDMARKER) != should_have_end) { - throw sdsl::simple_sds::InvalidData("Subchain end node " + std::to_string(this->end) + " does not match type" + std::to_string(temp)); + if ((subchain.end != gbwt::ENDMARKER) != should_have_end) { + throw sdsl::simple_sds::InvalidData("Subchain end node " + std::to_string(subchain.end) + " does not match type" + std::to_string(temp)); } +} - this->kmers = sdsl::simple_sds::load_vector>(in); - this->sequences = sdsl::simple_sds::load_vector(in); - this->kmers_present.simple_sds_load(in); - if (kmers_present.size() != kmers.size() * sequences.size()) { +void load_subchain_kmers_present(Haplotypes::Subchain& subchain, std::istream& in) { + subchain.kmers_present.simple_sds_load(in); + if (subchain.kmers_present.size() != subchain.kmers.size() * subchain.sequences.size()) { throw sdsl::simple_sds::InvalidData("Invalid length for the kmer presence bitvector in subchain from " + - std::to_string(this->start) + " to " + std::to_string(this->end)); + std::to_string(subchain.start) + " to " + std::to_string(subchain.end)); + } +} + +void Haplotypes::Subchain::load_v1(std::istream& in) { + load_subchain_header(*this, in); + + // Kmer and sequence information must be converted to a more compact format. + auto kmers_counts = sdsl::simple_sds::load_vector>(in); + auto seqs = sdsl::simple_sds::load_vector(in); + this->kmers = std::vector(kmers_counts.size()); + this->kmer_counts = sdsl::int_vector<0>(kmers_counts.size(), 0, sdsl::bits::length(this->sequences.size())); + for (size_t i = 0; i < kmers_counts.size(); i++) { + this->kmers[i] = kmers_counts[i].first; + this->kmer_counts[i] = kmers_counts[i].second; + } + this->sequences = std::vector(seqs.size()); + for (size_t i = 0; i < seqs.size(); i++) { + this->sequences[i].first = seqs[i].first; + this->sequences[i].second = seqs[i].second; } + + load_subchain_kmers_present(*this, in); +} + +void Haplotypes::Subchain::simple_sds_load(std::istream& in) { + load_subchain_header(*this, in); + + this->kmers = sdsl::simple_sds::load_vector(in); + this->kmer_counts.simple_sds_load(in); + this->sequences = sdsl::simple_sds::load_vector(in); + + load_subchain_kmers_present(*this, in); } size_t Haplotypes::Subchain::simple_sds_size() const { size_t result = sdsl::simple_sds::value_size() + 2 * sdsl::simple_sds::value_size(); result += sdsl::simple_sds::vector_size(this->kmers); + result += this->kmer_counts.simple_sds_size(); result += sdsl::simple_sds::vector_size(this->sequences); result += this->kmers_present.simple_sds_size(); return result; @@ -217,14 +311,25 @@ void Haplotypes::TopLevelChain::simple_sds_load(std::istream& in) { } } -void Haplotypes::TopLevelChain::load_old(std::istream& in) { +void Haplotypes::TopLevelChain::load_v1(std::istream& in) { this->offset = sdsl::simple_sds::load_value(in); this->job_id = sdsl::simple_sds::load_value(in); this->contig_name = "component_" + std::to_string(this->offset); size_t subchain_count = sdsl::simple_sds::load_value(in); this->subchains.resize(subchain_count); for (size_t i = 0; i < subchain_count; i++) { - this->subchains[i].simple_sds_load(in); + this->subchains[i].load_v1(in); + } +} + +void Haplotypes::TopLevelChain::load_v2(std::istream& in) { + this->offset = sdsl::simple_sds::load_value(in); + this->job_id = sdsl::simple_sds::load_value(in); + this->contig_name = sdsl::simple_sds::load_string(in); + size_t subchain_count = sdsl::simple_sds::load_value(in); + this->subchains.resize(subchain_count); + for (size_t i = 0; i < subchain_count; i++) { + this->subchains[i].load_v1(in); } } @@ -263,8 +368,10 @@ void Haplotypes::simple_sds_load(std::istream& in) { for (auto& chain : this->chains) { if (this->header.version == Header::VERSION) { chain.simple_sds_load(in); + } else if (this->header.version == 2) { + chain.load_v2(in); } else { - chain.load_old(in); + chain.load_v1(in); } } @@ -420,7 +527,14 @@ Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& paramete result.header.total_subchains += total_subchains; result.header.total_kmers += total_kmers; if (this->verbosity >= Haplotypes::verbosity_detailed) { - std::cerr << "Finished job " << job << " with " << chains.size() << " chains, " << total_subchains << " subchains, and " << total_kmers << " kmers" << std::endl; + std::cerr << "Finished job " << job << " with " << chains.size() << " chains ("; + for (size_t i = 0; i < chains.size(); i++) { + if (i > 0) { + std::cerr << ", "; + } + std::cerr << result.chains[chains[i].offset].contig_name; + } + std::cerr << "), " << total_subchains << " subchains, and " << total_kmers << " kmers" << std::endl; } } } @@ -720,7 +834,8 @@ std::vector HaplotypePartitioner::unique_minimi empty. */ void present_kmers(const std::vector>& sequences, - std::vector>& all_kmers, + std::vector& all_kmers, + sdsl::int_vector<0>& kmer_counts, sdsl::bit_vector& kmers_present) { // Build a map of distinct kmers. For each kmer, record the largest sequence @@ -744,10 +859,12 @@ void present_kmers(const std::vector(present.size(), 0, sdsl::bits::length(sequences.size())); size_t offset = 0; for (auto iter = present.begin(); iter != present.end(); ++iter) { if (iter->second.second < sequences.size()) { - all_kmers.push_back({ iter->first, iter->second.second }); + all_kmers.push_back(iter->first); + kmer_counts[offset] = iter->second.second; iter->second.first = offset; offset++; } @@ -810,7 +927,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain output.subchains.push_back({ iter->first.type, gbwtgraph::GBWTGraph::handle_to_node(iter->first.start), gbwtgraph::GBWTGraph::handle_to_node(iter->first.end), - {}, {}, sdsl::bit_vector() + {}, sdsl::int_vector<0>(0, 0, 64), {}, sdsl::bit_vector() }); Haplotypes::Subchain& subchain = output.subchains.back(); std::vector> kmers_by_sequence; @@ -818,8 +935,11 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain for (sequence_type sequence : iter->second) { kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first)); } - present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present); - subchain.sequences = std::move(iter->second); + present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present); + subchain.sequences = std::vector(iter->second.size()); + for (size_t i = 0; i < iter->second.size(); i++) { + subchain.sequences[i] = Haplotypes::compact_sequence_type(iter->second[i].first, iter->second[i].second); + } } } @@ -830,7 +950,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain output.subchains.push_back({ Haplotypes::Subchain::full_haplotype, gbwt::ENDMARKER, gbwt::ENDMARKER, - {}, {}, sdsl::bit_vector() + {}, sdsl::int_vector<0>(0, 0, 64), {}, sdsl::bit_vector() }); Haplotypes::Subchain& subchain = output.subchains.back(); gbwt::node_type node = gbwtgraph::GBWTGraph::handle_to_node(chain.handle); @@ -840,10 +960,10 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain for (auto seq_id : sequences) { kmers_by_sequence.emplace_back(this->unique_minimizers(seq_id)); } - present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present); + present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present); subchain.sequences.reserve(sequences.size()); for (size_t i = 0; i < sequences.size(); i++) { - subchain.sequences.push_back({ sequences[i], 0 }); + subchain.sequences.push_back(Haplotypes::compact_sequence_type(sequences[i], 0)); } } } @@ -1056,19 +1176,75 @@ void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) { } void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) { - std::string sample_name = "recombination"; + std::string sample_name = "recombination"; // TODO: Make this a static class variable. metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment); builder.insert(this->path, true); } //------------------------------------------------------------------------------ +/* + * An additional haplotype fragment from a bad subchain. + * + * GBWT metadata will be set as following: + * + * * Sample name is "fragment". + * * Contig name is taken from the top-level chain. + * * Haplotype identifier is set during construction. + * * Fragment identifier is the identifier of the subchain. + */ +struct RecombinatorFragment { + // GBWT starting position (inclusive). + gbwt::edge_type from; + + // GBWT ending position (inclusive). + gbwt::node_type to; + + // Haplotype identifier. + size_t haplotype; + + // Fragment / subchain identifier. + size_t fragment; + + RecombinatorFragment(gbwt::edge_type from, gbwt::node_type to, size_t haplotype, size_t fragment) : + from(from), to(to), haplotype(haplotype), fragment(fragment) {} + + // Generates the GBWT path and the metadata for the fragment. + void generate( + const gbwt::GBWT& index, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, + const std::string& contig_name + ) const; +}; + +void RecombinatorFragment::generate( + const gbwt::GBWT& index, + gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata, + const std::string& contig_name +) const { + gbwt::vector_type path; + for (gbwt::edge_type curr = this->from; curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) { + path.push_back(curr.first); + if (curr.first == this->to) { + break; + } + } + + std::string sample_name = "fragment"; // TODO: Make this a static class variable. + metadata.add_haplotype(sample_name, contig_name, this->haplotype, this->fragment); + builder.insert(path, true); +} + +//------------------------------------------------------------------------------ + void Recombinator::Statistics::combine(const Statistics& another) { this->chains += another.chains; this->subchains += another.subchains; + this->bad_subchains += another.bad_subchains; this->fragments += another.fragments; this->full_haplotypes += another.full_haplotypes; this->haplotypes = std::max(this->haplotypes, another.haplotypes); + this->extra_fragments += another.extra_fragments; this->connections += another.connections; this->ref_paths += another.ref_paths; this->kmers += another.kmers; @@ -1078,6 +1254,9 @@ void Recombinator::Statistics::combine(const Statistics& another) { std::ostream& Recombinator::Statistics::print(std::ostream& out) const { out << this->haplotypes << " haplotypes for " << this->chains << " chains (" << this->full_haplotypes << " full, " << this->subchains << " subchains, " << this->fragments << " fragments)"; + if (this->bad_subchains > 0) { + out << "; " << this->bad_subchains << " bad subchains with " << this->extra_fragments << " extra fragments"; + } if (this->subchains > 0) { double connection_rate = static_cast(this->connections) / (this->subchains * this->haplotypes); out << "; connection rate " << connection_rate; @@ -1123,7 +1302,11 @@ void Recombinator::Parameters::print(std::ostream& out) const { out << "- kmer coverage " << this->coverage << std::endl; } if (this->diploid_sampling) { - out << "- diploid sampling (" << this->num_haplotypes << " candidates)" << std::endl; + out << "- diploid sampling (" << this->num_haplotypes << " candidates"; + if (this->extra_fragments) { + out << ", badness threshold " << this->badness_threshold; + } + out << ")" << std::endl; } else { out << "- heuristic sampling (" << this->num_haplotypes << " haplotypes)" << std::endl; } @@ -1157,6 +1340,10 @@ void recombinator_sanity_checks(const Recombinator::Parameters& parameters) { std::string msg = "recombinator_sanity_checks(): number of haplotypes cannot be 0"; throw std::runtime_error(msg); } + if (parameters.badness_threshold <= 0.0) { + std::string msg = "recombinator_sanity_checks(): badness threshold must be positive"; + throw std::runtime_error(msg); + } if (parameters.diploid_sampling && parameters.num_haplotypes < 2) { std::string msg = "recombinator_sanity_checks(): diploid sampling requires at least 2 haplotypes"; throw std::runtime_error(msg); @@ -1372,7 +1559,7 @@ std::vector> classify_kmers( std::vector> kmer_types; size_t selected_kmers = 0; for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { - double count = kmer_counts.at(subchain.kmers[kmer_id].first); + double count = kmer_counts.at(subchain.kmers[kmer_id]); if (count < absent_threshold) { kmer_types.push_back({ Recombinator::absent, -1.0 * parameters.absent_score }); selected_kmers++; @@ -1434,10 +1621,15 @@ std::vector Recombinator::classify_kmers( // Select the best pair of haplotypes from the candidates. Each haplotype gets // +1 for getting a kmer right and -1 for getting it wrong. +// Also returns the remaining non-duplicated haplotypes as extra fragments if +// the appropriate paremeters have been set. std::vector> select_diploid( + const gbwtgraph::GBZ& gbz, const Haplotypes::Subchain& subchain, const std::vector>& candidates, - const std::vector>& kmer_types + const std::vector>& kmer_types, + size_t original_selected, + const Recombinator::Parameters& parameters ) { std::int64_t best_score = std::numeric_limits::min(); size_t best_left = 0, best_right = 1; @@ -1471,6 +1663,24 @@ std::vector> select_diploid( } } + // If this is a bad subchain, we move the selected haplotypes to the front + // and return the remaining non-duplicated haplotypes as extra fragments. + // Otherwise we return only the selected haplotypes. + if (parameters.extra_fragments) { + double badness = subchain.badness(gbz); + if (badness > parameters.badness_threshold) { + std::vector> result; + result.push_back(candidates[best_left]); + result.push_back(candidates[best_right]); + for (size_t i = 0; i < original_selected; i++) { + if (i != best_left && i != best_right) { + result.push_back(candidates[i]); + } + } + return result; + } + } + return { candidates[best_left], candidates[best_right] }; } @@ -1479,6 +1689,7 @@ std::vector> select_diploid( // Updates the local haplotypes with scores and ranks in each round of selection // if provided. std::vector> select_haplotypes( + const gbwtgraph::GBZ& gbz, const Haplotypes::Subchain& subchain, const hash_map& kmer_counts, double coverage, @@ -1558,12 +1769,14 @@ std::vector> select_haplotypes( selected_haplotypes.push_back(next); } - // Do diploid sampling if necessary. + // Do diploid sampling. If this is a bad subchain, we also return the + // extra fragments starting from the third haplotype. But we don't return + // duplicated ones. if (parameters.diploid_sampling) { - return select_diploid(subchain, selected_haplotypes, kmer_types); + return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, original_selected, parameters); + } else { + return selected_haplotypes; } - - return selected_haplotypes; } Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::TopLevelChain& chain, @@ -1592,16 +1805,28 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top statistics.full_haplotypes = 1; } else { bool have_haplotypes = false; - for (auto& subchain : chain.subchains) { + std::vector extra_fragments; + for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { + const auto& subchain = chain.subchains[subchain_id]; if (subchain.type == Haplotypes::Subchain::full_haplotype) { throw std::runtime_error("Recombinator::generate_haplotypes(): nontrivial chain " + std::to_string(chain.offset) + " contains a subchain with full haplotypes"); } assert(!subchain.sequences.empty()); - // Select the haplotypes greedily. + // Select the haplotypes greedily. If we are doing diploid sampling and we get + // extra fragments, store them for later processing. std::vector> selected_haplotypes = select_haplotypes( - subchain, kmer_counts, coverage, &statistics, nullptr, parameters + this->gbz, subchain, kmer_counts, coverage, &statistics, nullptr, parameters ); + if (parameters.diploid_sampling && selected_haplotypes.size() > 2) { + for (size_t i = 2; i < selected_haplotypes.size(); i++) { + gbwt::edge_type start(subchain.start, subchain.sequences[selected_haplotypes[i].first].second); + extra_fragments.emplace_back(start, subchain.end, i - 1, subchain_id); + } + statistics.bad_subchains++; + statistics.extra_fragments += selected_haplotypes.size() - 2; + selected_haplotypes.resize(2); + } // Try to match the existing haplotypes with the selected sequences based on // GBWT sequence id. @@ -1650,6 +1875,11 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top } } statistics.fragments = haplotypes.front().fragment; + + // Add the extra fragments as separate paths. + for (const RecombinatorFragment& fragment : extra_fragments) { + fragment.generate(this->gbz.index, builder, metadata, chain.contig_name); + } } return statistics; @@ -1696,7 +1926,7 @@ std::vector Recombinator::extract_sequences( double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity); // Fill in the scores. - select_haplotypes(subchain, counts, coverage, nullptr, &result, parameters); + select_haplotypes(this->gbz, subchain, counts, coverage, nullptr, &result, parameters); return result; } diff --git a/src/recombinator.hpp b/src/recombinator.hpp index 17e6f618b2..b8969917fd 100644 --- a/src/recombinator.hpp +++ b/src/recombinator.hpp @@ -37,6 +37,9 @@ namespace vg { * * Versions: * + * * Version 3: Subchains use smaller integers when possible. Compatible with + * version 2. + * * * Version 2: Top-level chains include a contig name. Compatible with version 1. * * * Version 1: Initial version. @@ -61,7 +64,7 @@ class Haplotypes { /// Header of the serialized file. struct Header { constexpr static std::uint32_t MAGIC_NUMBER = 0x4C504148; // "HAPL" - constexpr static std::uint32_t VERSION = 2; + constexpr static std::uint32_t VERSION = 3; constexpr static std::uint32_t MIN_VERSION = 1; constexpr static std::uint64_t DEFAULT_K = 29; @@ -90,6 +93,9 @@ class Haplotypes { /// A GBWT sequence as (sequence identifier, offset in a node). typedef std::pair sequence_type; + /// A more space-efficient representation of `sequence_type`. + typedef std::pair compact_sequence_type; + /// Representation of a subchain. struct Subchain { /// Subchain types. @@ -118,13 +124,21 @@ class Haplotypes { /// A vector of distinct kmers. For each kmer, list the kmer itself and the number /// of haplotypes it appears in. - std::vector> kmers; + std::vector kmers; - // TODO: This could be smaller - /// Sequences as (GBWT sequence id, offset in the relevant node). - std::vector sequences; + /// Number of haplotypes each kmer appears in. + sdsl::int_vector<0> kmer_counts; - // TODO: This needs to be compressed for larger datasets. + /// Sequences as (GBWT sequence id, offset in the relevant node). + std::vector sequences; + + // TODO v3: Use an extra bit for each sequence to mark whether the presence for that sequence + // is stored explicitly or relative to the last explicit sequence. + // We need to cluster the sequences by similarity and store the clusters consecutively. + // And then use sd_vector for the sequences with relative presence. + // Decompress to a single bitvector when needed. + /// A bit vector marking the presence of kmers in the sequences. + /// Sequence `i` contains kmer `j` if and only if `kmers_present[i * kmers.size() + j] == 1`. sdsl::bit_vector kmers_present; /// Returns the start node as a GBWTGraph handle. @@ -142,9 +156,30 @@ class Haplotypes { /// Returns a string representation of the type and the boundary nodes. std::string to_string() const; + /// Returns (sequence identifier, offset in a node) for the given sequence. + sequence_type get_sequence(size_t i) const { + return { this->sequences[i].first, this->sequences[i].second }; + } + + /// Returns the distance from the last base of `start` to the first base of + /// `end` over the given sequence. Returns 0 if the subchain is not normal or + /// if the sequence does not exist. + size_t distance(const gbwtgraph::GBZ& gbz, size_t i) const; + + /// Returns an estimate of the badness of the subchain. + /// The ideal value is 0.0, and higher values indicate worse subchains. + /// The estimate is based on the following factors: + /// * Length of the subchain. + /// * Number of haplotypes relative to the expected number. + /// * Information content of the kmers (disabled). + double badness(const gbwtgraph::GBZ& gbz) const; + /// Serializes the object to a stream in the simple-sds format. void simple_sds_serialize(std::ostream& out) const; + /// Loads a less space-efficient version 1 or 2 subchain. + void load_v1(std::istream& in); + /// Loads the object from a stream in the simple-sds format. void simple_sds_load(std::istream& in); @@ -172,8 +207,11 @@ class Haplotypes { /// Loads the object from a stream in the simple-sds format. void simple_sds_load(std::istream& in); - /// Loads the old version without a contig name. - void load_old(std::istream& in); + /// Loads a version 1 chain without a contig name. + void load_v1(std::istream& in); + + /// Loads a less space-efficient version 2 chain. + void load_v2(std::istream& in); /// Returns the size of the object in elements. size_t simple_sds_size() const; @@ -392,6 +430,10 @@ class Recombinator { /// A reasonable number of candidates for diploid sampling. constexpr static size_t NUM_CANDIDATES = 32; + // TODO: Proper threshold? + /// Badness threshold for subchains. + constexpr static double BADNESS_THRESHOLD = 4.0; + /// Expected kmer coverage. Use 0 to estimate from kmer counts. constexpr static size_t COVERAGE = 0; @@ -425,6 +467,9 @@ class Recombinator { /// Number of subchains. size_t subchains = 0; + /// Number of subchains exceeding the badness threshold. + size_t bad_subchains = 0; + /// Number of fragments. size_t fragments = 0; @@ -434,6 +479,9 @@ class Recombinator { /// Number of haplotypes generated. size_t haplotypes = 0; + /// Number of additional haplotype fragments in bad subchains. + size_t extra_fragments = 0; + /// Number of times a haplotype was extended from a subchain to the next subchain. size_t connections = 0; @@ -491,9 +539,17 @@ class Recombinator { /// highest-scoring pair out of them. bool diploid_sampling = false; + /// When using diploid sampling, include the remaining candidates as + /// additional fragments in bad subchains. + bool extra_fragments = false; + + /// Badness threshold for subchains when using diploid sampling. + double badness_threshold = BADNESS_THRESHOLD; + /// Include named and reference paths. bool include_reference = false; + // TODO: Should be use extra_fragments? /// Preset parameters for common use cases. enum preset_t { /// Default parameters. diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 883558e567..05a204fdf8 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -2042,6 +2042,9 @@ int main_giraffe(int argc, char** argv) { std::cerr << "Thread " << thread_num << " now mapping " << aln1.name() << ", " << aln2.name() << std::endl; } + + check_quality_length(aln1); + check_quality_length(aln2); toUppercaseInPlace(*aln1.mutable_sequence()); toUppercaseInPlace(*aln2.mutable_sequence()); @@ -2150,6 +2153,7 @@ int main_giraffe(int argc, char** argv) { std::cerr << "Thread " << thread_num << " now mapping " << aln.name() << std::endl; } + check_quality_length(aln); toUppercaseInPlace(*aln.mutable_sequence()); // Map the read with the MinimizerMapper. diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp index 3a3c36a3ba..ec7a9f97bf 100644 --- a/src/subcommand/haplotypes_main.cpp +++ b/src/subcommand/haplotypes_main.cpp @@ -1,6 +1,14 @@ /** \file haplotypes_main.cpp * * Defines the "vg haplotypes" subcommand, which samples haplotypes by kmer counts in the reads. + * + * TODO: + * + * 1. Option to sample non-contiguous haplotypes. This may be important in large + * snarls. Select a suffix of a fragment, all fragments fully within the snarl, + * and the prefix of the fragment exiting the snarl. + * + * 2. Tests for --linear-structure, --extra-fragments, and #1. */ #include "subcommand.hpp" @@ -25,50 +33,62 @@ using namespace vg; //---------------------------------------------------------------------------- +// Default values for command line parameters. + +namespace haplotypes_defaults { + constexpr size_t DEFAULT_MAX_THREADS = 16; -size_t haplotypes_default_threads() { +size_t threads() { size_t threads = omp_get_max_threads(); threads = std::max(threads, size_t(1)); return std::min(threads, DEFAULT_MAX_THREADS); } -constexpr size_t haplotypes_default_k() { +constexpr size_t k() { return Haplotypes::Header::DEFAULT_K; } -constexpr size_t haplotypes_default_w() { +constexpr size_t w() { return gbwtgraph::Key64::WINDOW_LENGTH; } -constexpr size_t haplotypes_default_subchain_length() { +constexpr size_t subchain_length() { return HaplotypePartitioner::SUBCHAIN_LENGTH; } -constexpr size_t haplotypes_default_n() { +constexpr size_t n() { return Recombinator::NUM_HAPLOTYPES; } -constexpr size_t haplotypes_default_candidates() { +constexpr size_t candidates() { return Recombinator::NUM_CANDIDATES; } -constexpr size_t haplotypes_default_coverage() { +constexpr size_t coverage() { return Recombinator::COVERAGE; } -constexpr double haplotypes_default_discount() { +constexpr double discount() { return Recombinator::PRESENT_DISCOUNT; } -constexpr double haplotypes_default_adjustment() { +constexpr double adjustment() { return Recombinator::HET_ADJUSTMENT; } -constexpr double haplotypes_default_absent() { +constexpr double absent() { return Recombinator::ABSENT_SCORE; } +constexpr double badness() { + return Recombinator::BADNESS_THRESHOLD; +} + +}; // namespace haplotypes_defaults + +//---------------------------------------------------------------------------- + struct HaplotypesConfig { enum OperatingMode { mode_invalid, @@ -78,6 +98,7 @@ struct HaplotypesConfig { mode_map_variants, mode_extract, mode_classify, + mode_statistics, }; OperatingMode mode = mode_invalid; @@ -90,7 +111,7 @@ struct HaplotypesConfig { std::string haplotype_input, kmer_input, vcf_input; // Computational parameters. - size_t k = haplotypes_default_k(), w = haplotypes_default_w(); + size_t k = haplotypes_defaults::k(), w = haplotypes_defaults::w(); HaplotypePartitioner::Parameters partitioner_parameters; Recombinator::Parameters recombinator_parameters; @@ -101,8 +122,11 @@ struct HaplotypesConfig { size_t chain_id = std::numeric_limits::max(); size_t subchain_id = std::numeric_limits::max(); + // For subchain statistics. + std::string ref_sample; + // Other parameters. - size_t threads = haplotypes_default_threads(); + size_t threads = haplotypes_defaults::threads(); bool validate = false; HaplotypesConfig(int argc, char** argv, size_t max_threads); @@ -118,13 +142,15 @@ void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); +void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config); + //---------------------------------------------------------------------------- int main_haplotypes(int argc, char** argv) { double start = gbwt::readTimer(); gbwt::Verbosity::set(gbwt::Verbosity::SILENT); size_t max_threads = omp_get_max_threads(); - omp_set_num_threads(haplotypes_default_threads()); + omp_set_num_threads(haplotypes_defaults::threads()); // Parse the arguments. HaplotypesConfig config(argc, argv, max_threads); @@ -177,6 +203,11 @@ int main_haplotypes(int argc, char** argv) { classify_kmers(gbz, haplotypes, config); } + // Output statistics on subchain lengths and kmer counts. + if (config.mode == HaplotypesConfig::mode_statistics) { + subchain_statistics(gbz, haplotypes, config); + } + if (config.verbosity >= Haplotypes::verbosity_basic) { double seconds = gbwt::readTimer() - start; double gib = gbwt::inGigabytes(gbwt::memoryUsage()); @@ -198,6 +229,8 @@ void help_haplotypes(char** argv, bool developer_options) { if (developer_options) { std::cerr << usage << "-i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv" << std::endl; std::cerr << usage << "-i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa" << std::endl; + std::cerr << usage << "-i graph.hapl -k kmers.kff --classify output.kmers graph.gbz" << std::endl; + std::cerr << usage << "-i graph.hapl --statistics ref_sample graph.gbz > output.tsv" << std::endl; } std::cerr << std::endl; @@ -214,26 +247,28 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << " -k, --kmer-input X use kmer counts from this KFF file (required for --gbz-output)" << std::endl; std::cerr << std::endl; std::cerr << "Options for generating haplotype information:" << std::endl; - std::cerr << " --kmer-length N kmer length for building the minimizer index (default: " << haplotypes_default_k() << ")" << std::endl; - std::cerr << " --window-length N window length for building the minimizer index (default: " << haplotypes_default_w() << ")" << std::endl; - std::cerr << " --subchain-length N target length (in bp) for subchains (default: " << haplotypes_default_subchain_length() << ")" << std::endl; + std::cerr << " --kmer-length N kmer length for building the minimizer index (default: " << haplotypes_defaults::k() << ")" << std::endl; + std::cerr << " --window-length N window length for building the minimizer index (default: " << haplotypes_defaults::w() << ")" << std::endl; + std::cerr << " --subchain-length N target length (in bp) for subchains (default: " << haplotypes_defaults::subchain_length() << ")" << std::endl; std::cerr << " --linear-structure extend subchains to avoid haplotypes visiting them multiple times" << std::endl; std::cerr << std::endl; std::cerr << "Options for sampling haplotypes:" << std::endl; std::cerr << " --preset X use preset X (default, haploid, diploid)" << std::endl; std::cerr << " --coverage N kmer coverage in the KFF file (default: estimate)" << std::endl; - std::cerr << " --num-haplotypes N generate N haplotypes (default: " << haplotypes_default_n() << ")" << std::endl; - std::cerr << " sample from N candidates (with --diploid-sampling; default: " << haplotypes_default_candidates() << ")" << std::endl; - std::cerr << " --present-discount F discount scores for present kmers by factor F (default: " << haplotypes_default_discount() << ")" << std::endl; - std::cerr << " --het-adjustment F adjust scores for heterozygous kmers by F (default: " << haplotypes_default_adjustment() << ")" << std::endl; - std::cerr << " --absent-score F score absent kmers -F/+F (default: " << haplotypes_default_absent() << ")" << std::endl; + std::cerr << " --num-haplotypes N generate N haplotypes (default: " << haplotypes_defaults::n() << ")" << std::endl; + std::cerr << " sample from N candidates (with --diploid-sampling; default: " << haplotypes_defaults::candidates() << ")" << std::endl; + std::cerr << " --present-discount F discount scores for present kmers by factor F (default: " << haplotypes_defaults::discount() << ")" << std::endl; + std::cerr << " --het-adjustment F adjust scores for heterozygous kmers by F (default: " << haplotypes_defaults::adjustment() << ")" << std::endl; + std::cerr << " --absent-score F score absent kmers -F/+F (default: " << haplotypes_defaults::absent() << ")" << std::endl; std::cerr << " --haploid-scoring use a scoring model without heterozygous kmers" << std::endl; std::cerr << " --diploid-sampling choose the best pair from the sampled haplotypes" << std::endl; + std::cerr << " --extra-fragments in diploid sampling, select all candidates in bad subchains" << std::endl; + std::cerr << " --badness F threshold for the badness of a subchain (default: " << haplotypes_defaults::badness() << ")" << std::endl; std::cerr << " --include-reference include named and reference paths in the output" << std::endl; std::cerr << std::endl; std::cerr << "Other options:" << std::endl; std::cerr << " -v, --verbosity N verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0)" << std::endl; - std::cerr << " -t, --threads N approximate number of threads (default: " << haplotypes_default_threads() << " on this system)" << std::endl; + std::cerr << " -t, --threads N approximate number of threads (default: " << haplotypes_defaults::threads() << " on this system)" << std::endl; std::cerr << std::endl; if (developer_options) { std::cerr << "Developer options:" << std::endl; @@ -241,8 +276,9 @@ void help_haplotypes(char** argv, bool developer_options) { std::cerr << " --vcf-input X map the variants in VCF file X to subchains" << std::endl; std::cerr << " --contig-prefix X a prefix for transforming VCF contig names into GBWT contig names" << std::endl; std::cerr << " --extract M:N extract haplotypes in chain M, subchain N in FASTA format" << std::endl; - std::cerr << " --score-output X write haplotype scores to X" << std::endl; + std::cerr << " --score-output X write haplotype scores to X (use with --extract)" << std::endl; std::cerr << " --classify X classify kmers and write output to X" << std::endl; + std::cerr << " --statistics X output subchain statistics over reference sample X" << std::endl; std::cerr << std::endl; } } @@ -262,13 +298,16 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { constexpr int OPT_ABSENT_SCORE = 1305; constexpr int OPT_HAPLOID_SCORING = 1306; constexpr int OPT_DIPLOID_SAMPLING = 1307; - constexpr int OPT_INCLUDE_REFERENCE = 1308; + constexpr int OPT_EXTRA_FRAGMENTS = 1308; + constexpr int OPT_BADNESS = 1309; + constexpr int OPT_INCLUDE_REFERENCE = 1310; constexpr int OPT_VALIDATE = 1400; constexpr int OPT_VCF_INPUT = 1500; constexpr int OPT_CONTIG_PREFIX = 1501; constexpr int OPT_EXTRACT = 1600; constexpr int OPT_SCORE_OUTPUT = 1601; constexpr int OPT_CLASSIFY = 1602; + constexpr int OPT_STATISTICS = 1603; static struct option long_options[] = { @@ -290,6 +329,8 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { { "absent-score", required_argument, 0, OPT_ABSENT_SCORE }, { "haploid-scoring", no_argument, 0, OPT_HAPLOID_SCORING }, { "diploid-sampling", no_argument, 0, OPT_DIPLOID_SAMPLING }, + { "extra-fragments", no_argument, 0, OPT_EXTRA_FRAGMENTS }, + { "badness", required_argument, 0, OPT_BADNESS }, { "include-reference", no_argument, 0, OPT_INCLUDE_REFERENCE }, { "verbosity", required_argument, 0, 'v' }, { "threads", required_argument, 0, 't' }, @@ -299,6 +340,7 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { { "extract", required_argument, 0, OPT_EXTRACT }, { "score-output", required_argument, 0, OPT_SCORE_OUTPUT }, { "classify", required_argument, 0, OPT_CLASSIFY }, + { "statistics", required_argument, 0, OPT_STATISTICS }, { 0, 0, 0, 0 } }; @@ -413,6 +455,16 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_DIPLOID_SAMPLING: this->recombinator_parameters.diploid_sampling = true; break; + case OPT_EXTRA_FRAGMENTS: + this->recombinator_parameters.extra_fragments = true; + break; + case OPT_BADNESS: + this->recombinator_parameters.badness_threshold = parse(optarg); + if (this->recombinator_parameters.badness_threshold <= 0.0) { + std::cerr << "error: [vg haplotypes] badness threshold must be positive" << std::endl; + std::exit(EXIT_FAILURE); + } + break; case OPT_INCLUDE_REFERENCE: this->recombinator_parameters.include_reference = true; break; @@ -463,6 +515,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { case OPT_CLASSIFY: this->kmer_output = optarg; break; + case OPT_STATISTICS: + this->ref_sample = optarg; + break; case 'h': case '?': @@ -492,6 +547,8 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { this->mode = mode_extract; } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->kmer_output.empty()) { this->mode = mode_classify; + } else if (!this->haplotype_input.empty() && !this->ref_sample.empty()) { + this->mode = mode_statistics; } if (this->mode == mode_invalid) { help_haplotypes(argv, false); @@ -500,7 +557,7 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) { // Use conditional defaults if the user did not override them. if (this->recombinator_parameters.diploid_sampling && !num_haplotypes_set) { - this->recombinator_parameters.num_haplotypes = haplotypes_default_candidates(); + this->recombinator_parameters.num_haplotypes = haplotypes_defaults::candidates(); } } @@ -655,26 +712,33 @@ gbwt::size_type path_for_contig(const gbwtgraph::GBZ& gbz, gbwt::size_type conti return path_id; } -std::pair seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) { +// Returns the GBWT sequence id for the given path in the orientation it appears in this top-level chain. +// Returns `gbwt::invalid_sequence()` if the path is not found. +gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, size_t chain_id) { gbwt::size_type sequence_id = gbwt::Path::encode(path_id, false); gbwt::size_type reverse_id = gbwt::Path::encode(path_id, true); + for (const Haplotypes::Subchain& subchain : haplotypes.chains[chain_id].subchains) { + for (const std::pair& sequence : subchain.sequences) { + if (sequence.first == sequence_id) { + return sequence.first; + } + if (sequence.first == reverse_id) { + return sequence.first; + } + } + } + return gbwt::invalid_sequence(); +} + +std::pair seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) { size_t found_chains = 0; std::pair result(gbwt::invalid_sequence(), haplotypes.components()); for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { - const Haplotypes::Subchain& subchain = haplotypes.chains[chain_id].subchains.front(); - for (size_t i = 0; i < subchain.sequences.size(); i++) { - if (subchain.sequences[i].first == sequence_id) { - result.first = sequence_id; - result.second = chain_id; - found_chains++; - break; - } - if (subchain.sequences[i].first == reverse_id) { - result.first = reverse_id; - result.second = chain_id; - found_chains++; - break; - } + gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id); + if (seq_id != gbwt::invalid_sequence()) { + result.first = seq_id; + result.second = chain_id; + found_chains++; } } if (found_chains != 1) { @@ -709,29 +773,32 @@ struct ReferenceInterval { return this->end - this->start; } - std::string to_string() const { - std::string result; + char type_as_char() const { switch (this->type) { case Haplotypes::Subchain::normal: - result.push_back('N'); - break; + return 'N'; case Haplotypes::Subchain::prefix: - result.push_back('P'); - break; + return 'P'; case Haplotypes::Subchain::suffix: - result.push_back('S'); - break; + return 'S'; case Haplotypes::Subchain::full_haplotype: - result.push_back('F'); - break; + return 'F'; } + return '?'; + } + + std::string to_string() const { + std::string result; + result.push_back(this->type_as_char()); result += std::to_string(this->id) + "(" + std::to_string(this->start) + ".." + std::to_string(this->end) + ")"; return result; } }; -std::vector subchain_intervals(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type sequence_id, size_t chain_id, bool reverse) { - gbwt::size_type actual_sequence_id = (reverse ? gbwt::Path::reverse(sequence_id) : sequence_id); +// Also returns the total length of the path / GBWT sequence in bp. +std::pair, size_t> subchain_intervals(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type sequence_id, size_t chain_id) { + bool reverse = gbwt::Path::is_reverse(sequence_id); + gbwt::size_type actual_sequence_id = gbwt::Path::encode(gbwt::Path::id(sequence_id), false); gbwt::vector_type path = gbz.index.extract(actual_sequence_id); size_t total_length = 0; for (auto gbwt_node : path) { @@ -778,7 +845,7 @@ std::vector subchain_intervals(const gbwtgraph::GBZ& gbz, con interval.start = seq_offset; } else if (subchain.type == Haplotypes::Subchain::prefix) { // If a prefix follows a suffix, they cover the same interval. - interval.start = result.back().start; + interval.start = (result.empty() ? 0 : result.back().start); } if (subchain.has_end()) { while (node_offset < path.size() && path[node_offset] != subchain.end) { @@ -787,14 +854,14 @@ std::vector subchain_intervals(const gbwtgraph::GBZ& gbz, con } interval.end = seq_offset; // If a prefix follows a suffix, they cover the same interval. - if (subchain.type == Haplotypes::Subchain::prefix) { + if (subchain.type == Haplotypes::Subchain::prefix && !result.empty()) { result.back().end = interval.end; } } result.push_back(interval); } - return result; + return { result, total_length }; } void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { @@ -868,7 +935,10 @@ void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const size_t offset = iter->second; gbwt::size_type sequence_id = offset_to_seq_chain[offset].first; gbwt::size_type chain_id = offset_to_seq_chain[offset].second; - auto ref_intervals = subchain_intervals(gbz, haplotypes, sequence_id, chain_id, gbwt::Path::is_reverse(sequence_id)); + std::vector ref_intervals; + size_t total_length; + std::tie(ref_intervals, total_length) = + subchain_intervals(gbz, haplotypes, sequence_id, chain_id); for (auto interval : variant_positions[offset]) { size_t low = 0, high = ref_intervals.size(); bool found = false; @@ -990,6 +1060,72 @@ void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, con //---------------------------------------------------------------------------- +gbwt::size_type path_for_sample_contig( + const gbwtgraph::GBZ& gbz, + const std::string& sample_name, const std::string& contig_name +) { + gbwt::size_type sample_id = gbz.index.metadata.sample(sample_name); + if (sample_id >= gbz.index.metadata.samples()) { + std::cerr << "error: [vg haplotypes] sample " << sample_name << " not found" << std::endl; + std::exit(EXIT_FAILURE); + } + gbwt::size_type contig_id = gbz.index.metadata.contig(contig_name); + if (contig_id >= gbz.index.metadata.contigs()) { + std::cerr << "error: [vg haplotypes] contig " << contig_name << " not found" << std::endl; + std::exit(EXIT_FAILURE); + } + + auto paths = gbz.index.metadata.findPaths(sample_id, contig_id); + if (paths.size() != 1) { + std::cerr << "error: [vg haplotypes] found " << paths.size() << " paths for sample " << sample_name << ", contig " << contig_name << std::endl; + std::exit(EXIT_FAILURE); + } + return paths.front(); +} + +void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) { + gbwt::size_type sample_id = gbz.index.metadata.sample(config.ref_sample); + if (sample_id >= gbz.index.metadata.samples()) { + std::cerr << "error: [vg haplotypes] sample " << config.ref_sample << " not found in the graph" << std::endl; + std::exit(EXIT_FAILURE); + } + + // Header line: graph name, sample name. + std::cout << "H\t" << config.graph_name << "\t" << config.ref_sample << std::endl; + + for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { + gbwt::size_type path_id = path_for_sample_contig(gbz, config.ref_sample, haplotypes.chains[chain_id].contig_name); + gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id); + if (seq_id == gbwt::invalid_sequence()) { + std::cerr << "error: [vg haplotypes] could not determine reference orientation in chain " << chain_id << std::endl; + std::exit(EXIT_FAILURE); + } + std::vector ref_intervals; + size_t total_length; + std::tie(ref_intervals, total_length) = subchain_intervals(gbz, haplotypes, seq_id, chain_id); + + // Contig line: chain id, contig name, total length. + std::cout + << "C\t" + << chain_id << "\t" + << haplotypes.chains[chain_id].contig_name << "\t" + << total_length << std::endl; + // For each subchain: type, id, start, end, length, number of kmers. + for (size_t i = 0; i < ref_intervals.size(); i++) { + size_t kmers = haplotypes.chains[chain_id].subchains[ref_intervals[i].id].kmers.size(); + std::cout + << ref_intervals[i].type_as_char() << "\t" + << ref_intervals[i].id << "\t" + << ref_intervals[i].start << "\t" + << ref_intervals[i].end << "\t" + << ref_intervals[i].length() << "\t" + << kmers << std::endl; + } + } +} + +//---------------------------------------------------------------------------- + void validate_error(const std::string& header, const std::string& message) { std::cerr << "error: [vg haplotypes] "; if (!header.empty()) { @@ -1205,7 +1341,7 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, if (subchain.type != Haplotypes::Subchain::full_haplotype) { hash_set all_kmers; for (size_t i = 0; i < subchain.kmers.size(); i++) { - all_kmers.insert(subchain.kmers[i].first); + all_kmers.insert(subchain.kmers[i]); } if (all_kmers.size() != subchain.kmers.size()) { std::string message = expected_got(subchain.kmers.size(), all_kmers.size()) + " kmers"; @@ -1224,15 +1360,15 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, } for (size_t j = 0, offset = i * subchain.kmers.size(); j < subchain.kmers.size(); j++, offset++) { if (subchain.kmers_present[offset]) { - auto iter = unique_minimizers.find(subchain.kmers[j].first); + auto iter = unique_minimizers.find(subchain.kmers[j]); if (iter == unique_minimizers.end()) { std::string message = "kmer " + std::to_string(j) + " not present in the haplotype"; validate_error_sequence(chain_id, subchain_id, i, message); } - used_kmers[subchain.kmers[j].first]++; + used_kmers[subchain.kmers[j]]++; iter->second = true; } else { - if (unique_minimizers.find(subchain.kmers[j].first) != unique_minimizers.end()) { + if (unique_minimizers.find(subchain.kmers[j]) != unique_minimizers.end()) { std::string message = "kmer " + std::to_string(j) + " is present in the haplotype"; validate_error_sequence(chain_id, subchain_id, i, message); } @@ -1247,8 +1383,8 @@ void validate_chain(const Haplotypes::TopLevelChain& chain, size_t invalid_count = 0; for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) { size_t count = 0; - auto iter = used_kmers.find(subchain.kmers[kmer_id].first); - if (iter == used_kmers.end() || iter->second != subchain.kmers[kmer_id].second) { + auto iter = used_kmers.find(subchain.kmers[kmer_id]); + if (iter == used_kmers.end() || iter->second != subchain.kmer_counts[kmer_id]) { invalid_count++; } } @@ -1335,7 +1471,7 @@ void validate_haplotypes(const Haplotypes& haplotypes, for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) { const Haplotypes::Subchain& subchain = chain.subchains[subchain_id]; for (size_t i = 0; i < subchain.kmers.size(); i++) { - auto iter = kmers.find(subchain.kmers[i].first); + auto iter = kmers.find(subchain.kmers[i]); if (iter != kmers.end()) { const Haplotypes::Subchain& prev = haplotypes.chains[iter->second.first].subchains[iter->second.second]; if (chain_id == iter->second.first && subchain_id == iter->second.second + 1 && subchain.type == Haplotypes::Subchain::prefix && prev.type == Haplotypes::Subchain::suffix) { @@ -1346,7 +1482,7 @@ void validate_haplotypes(const Haplotypes& haplotypes, validate_error_subchain(chain_id, subchain_id, message); } } - kmers[subchain.kmers[i].first] = { chain_id, subchain_id }; + kmers[subchain.kmers[i]] = { chain_id, subchain_id }; } } } diff --git a/src/subcommand/map_main.cpp b/src/subcommand/map_main.cpp index 956a4db47c..ad1fb39f58 100644 --- a/src/subcommand/map_main.cpp +++ b/src/subcommand/map_main.cpp @@ -978,6 +978,8 @@ int main_map(int argc, char** argv) { return; } + check_quality_length(alignment); + int tid = omp_get_thread_num(); vector alignments = mapper[tid]->align_multi(alignment, kmer_size, @@ -1016,6 +1018,8 @@ int main_map(int argc, char** argv) { }; function lambda = [&](Alignment& aln1, Alignment& aln2) { + check_quality_length(aln1); + check_quality_length(aln2); auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; auto alnp = our_mapper->align_paired_multi(aln1, @@ -1067,6 +1071,7 @@ int main_map(int argc, char** argv) { } else if (fastq2.empty()) { // single function lambda = [&](Alignment& alignment) { + check_quality_length(alignment); int tid = omp_get_thread_num(); vector alignments = mapper[tid]->align_multi(alignment, kmer_size, @@ -1092,6 +1097,8 @@ int main_map(int argc, char** argv) { } }; function lambda = [&](Alignment& aln1, Alignment& aln2) { + check_quality_length(aln1); + check_quality_length(aln2); auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; auto alnp = our_mapper->align_paired_multi(aln1, @@ -1165,6 +1172,8 @@ int main_map(int argc, char** argv) { } }; function lambda = [&](Alignment& aln1, Alignment& aln2) { + check_quality_length(aln1); + check_quality_length(aln2); auto our_mapper = mapper[omp_get_thread_num()]; bool queued_resolve_later = false; auto alnp = our_mapper->align_paired_multi(aln1, @@ -1214,6 +1223,7 @@ int main_map(int argc, char** argv) { } else { // Processing single-end GAM input function lambda = [&](Alignment& alignment) { + check_quality_length(alignment); int tid = omp_get_thread_num(); vector alignments = mapper[tid]->align_multi(alignment, kmer_size, diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index 7b00984a5c..f42ff1a0ac 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -2115,7 +2115,8 @@ int main_mpmap(int argc, char** argv) { if (watchdog) { watchdog->check_in(thread_num, alignment.name()); } - + + check_quality_length(alignment); toUppercaseInPlace(*alignment.mutable_sequence()); bool is_rna = uses_Us(alignment); @@ -2181,6 +2182,8 @@ int main_mpmap(int argc, char** argv) { watchdog->check_in(thread_num, alignment_1.name()); } + check_quality_length(alignment_1); + check_quality_length(alignment_2); toUppercaseInPlace(*alignment_1.mutable_sequence()); toUppercaseInPlace(*alignment_2.mutable_sequence()); diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 515c2112e1..7925a8d00a 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -21,6 +21,7 @@ using namespace std; using namespace vg; using namespace vg::subcommand; +string GZ_SUFFIX = ".gz"; void help_rna(char** argv) { cerr << "\nusage: " << argv[0] << " rna [options] graph.[vg|pg|hg|gbz] > splicing_graph.[vg|pg|hg]" << endl @@ -250,6 +251,21 @@ int32_t main_rna(int32_t argc, char** argv) { return 1; } + for (const auto& filenames : {std::cref(transcript_filenames), std::cref(intron_filenames)}) { + // For each collection of filenames (see https://stackoverflow.com/a/64794991/) + + for (auto filename : filenames.get()) { + + // taken from https://stackoverflow.com/a/20446239/ + if (filename.size() >= GZ_SUFFIX.size() && + filename.compare(filename.size() - GZ_SUFFIX.size(), GZ_SUFFIX.size(), GZ_SUFFIX) == 0) { + + cerr << "[vg rna] ERROR: Annotation file " << filename << " appears to be gzipped. Decompress it before use." << endl; + return 1; + } + } + } + if (!haplotypes_filename.empty() && gbz_format) { cerr << "[vg rna] ERROR: Only one set of haplotypes can be provided (GBZ file contains both a graph and haplotypes). Use either --haplotypes or --gbz-format." << endl; diff --git a/src/subcommand/view_main.cpp b/src/subcommand/view_main.cpp index c254058f08..859d54a835 100644 --- a/src/subcommand/view_main.cpp +++ b/src/subcommand/view_main.cpp @@ -41,7 +41,7 @@ void help_view(char** argv) { << " -V, --vg-in input VG format only" << endl << " -j, --json output JSON format" << endl - << " -J, --json-in input JSON format" << endl + << " -J, --json-in input JSON format (use with e.g. -a as necessary)" << endl << " -c, --json-stream streaming conversion of a VG format graph in line delimited JSON format" << endl << " (this cannot be loaded directly via -J)" << endl @@ -52,11 +52,11 @@ void help_view(char** argv) { << " -T, --turtle-in input turtle format." << endl << " -r, --rdf_base_uri set base uri for the RDF output" << endl - << " -a, --align-in input GAM format" << endl + << " -a, --align-in input GAM format, or JSON version of GAM format" << endl << " -A, --aln-graph GAM add alignments from GAM to the graph" << endl - << " -q, --locus-in input stream is Locus format" << endl - << " -z, --locus-out output stream Locus format" << endl + << " -q, --locus-in input is Locus format, or JSON version of Locus format" << endl + << " -z, --locus-out output is Locus format" << endl << " -Q, --loci FILE input is Locus format for use by dot output" << endl << " -d, --dot output dot format" << endl @@ -81,12 +81,12 @@ void help_view(char** argv) { << " -i, --interleaved fastq is interleaved paired-ended" << endl << " -L, --pileup output VG Pileup format" << endl - << " -l, --pileup-in input VG Pileup format" << endl + << " -l, --pileup-in input VG Pileup format, or JSON version of VG Pileup format" << endl << " -B, --distance-in input distance index" << endl << " -R, --snarl-in input VG Snarl format" << endl << " -E, --snarl-traversal-in input VG SnarlTraversal format" << endl - << " -K, --multipath-in input VG MultipathAlignment format (GAMP)" << endl + << " -K, --multipath-in input VG MultipathAlignment format (GAMP), or JSON version of GAMP format" << endl << " -k, --multipath output VG MultipathAlignment format (GAMP)" << endl << " -D, --expect-duplicates don't warn if encountering the same node or edge multiple times" << endl << " -x, --extract-tag TAG extract and concatenate messages with the given tag" << endl diff --git a/src/utility.cpp b/src/utility.cpp index d4f493fba3..8510d98369 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -10,6 +10,9 @@ #include #include #include +// We don't define _GNU_SOURCE to get the cpuset functions since we will +// already have it for libstdc++ on the platforms where we need them +#include // For setting the temporary directory in submodules. @@ -118,7 +121,7 @@ void choose_good_thread_count() { if (count == 0) { // First priority: OMP_NUM_THREADS const char* value = getenv("OMP_NUM_THREADS"); - if (value) { + if (value && *value != '\0') { // Read the value. Throws if it isn't a legit number. count = std::stoi(value); } @@ -144,6 +147,34 @@ void choose_good_thread_count() { } } +#if !defined(__APPLE__) && defined(_GNU_SOURCE) + if (count == 0) { + // Next priority: CPU affinity mask (used by Slurm) + cpu_set_t mask; + if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &mask)) { + // TODO: If you have >1024 bits in your mask, glibc can't deal and you will get EINVAL. + // We're supposed to then try increasingly large dynamically-allocated CPU flag sets until we find one that works. + auto problem = errno; + std::cerr << "warning[vg]: Cannot determine CPU count from affinity mask: " << strerror(problem) << std::endl; + } else { + // We're also supposed to intersect this mask with the actual + // existing processors, in case somebody flags on way more + // processors than actually exist. But Linux doesn't seem to do + // that by default, so we don't worry about it. + count = CPU_COUNT(&mask); + } + } +#endif + + if (count == 0) { + // Next priority: SLURM_JOB_CPUS_PER_NODE + const char* value = getenv("SLURM_JOB_CPUS_PER_NODE"); + if (value && *value != '\0') { + // Read the value. Throws if it isn't a legit number. + count = std::stoi(value); + } + } + if (count == 0) { // Next priority: hardware concurrency as reported by the STL. // This may itself be 0 if ungettable. diff --git a/src/utility.hpp b/src/utility.hpp index f798739df0..6d8d5c17f7 100644 --- a/src/utility.hpp +++ b/src/utility.hpp @@ -38,8 +38,9 @@ double get_fraction_of_ns(const string& seq); /// TODO: Assumes that this is the same for every parallel section. int get_thread_count(void); /// Decide on and apply a sensible OMP thread count. Pay attention to -/// OMP_NUM_THREADS if set, the "hardware concurrency", and container limit -/// information that may be available in /proc. +/// OMP_NUM_THREADS and SLURM_JOB_CPUS_PER_NODE if set, the "hardware +/// concurrency", and container limit information that may be available in +/// /proc. void choose_good_thread_count(); string wrap_text(const string& str, size_t width); bool is_number(const string& s);