Merge remote-tracking branch 'origin/master' into recursive-chunk

vgteam · Jan 17, 2025 · 11f8b6b · 11f8b6b · adamnovak · Jan 18, 2025
2 parents d2baa7e + e91be89
commit 11f8b6b
Show file tree

Hide file tree

Showing 17 changed files with 817 additions and 157 deletions.
diff --git a/deps/gbwtgraph b/deps/gbwtgraph
diff --git a/src/alignment.cpp b/src/alignment.cpp
@@ -2847,6 +2847,30 @@ void alignment_set_distance_to_correct(Alignment& aln, const map<string ,vector<
     }
 }
 
+void check_quality_length(const Alignment& aln) {
+    size_t quality_length = aln.quality().length();
+    if (quality_length == 0 || quality_length == aln.sequence().length()) {
+        // This length is acceptable.
+        return;
+    }
+
+    bool too_short = quality_length < aln.sequence().length();
+
+    std::stringstream ss;
+    ss << "Read " << aln.name() << " has " << aln.sequence().length() << " bases of sequence but ";
+    if (too_short) {
+        ss << "only ";
+    }
+    ss << quality_length << " base quality values.";
+    if (too_short) {
+        ss << " Was the quality information truncated?";
+    }
+
+    #pragma omp critical (cerr)
+    std::cerr << "error [vg::alignment.cpp]: " << ss.str() << std::endl;
+    exit(1);
+}
+
 AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph, bool check_sequence) {
     size_t read_idx = 0;
     for (size_t i = 0; i < aln.path().mapping_size(); ++i) {

diff --git a/src/alignment.hpp b/src/alignment.hpp
@@ -328,6 +328,13 @@ map<string ,vector<pair<size_t, bool> > > alignment_refpos_to_path_offsets(const
 void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map<string, string>* translation = nullptr);
 void alignment_set_distance_to_correct(Alignment& aln, const map<string, vector<pair<size_t, bool>>>& base_offsets, const unordered_map<string, string>* translation = nullptr);
 
+/**
+ * Stop the program and print a useful error message if the alignment has
+ * quality values, but not the right number of them for the number of sequence
+ * bases. 
+ */
+void check_quality_length(const Alignment& aln);
+
 /**
  * Represents a report on whether an alignment makes sense in the context of a graph.
  */

diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp
@@ -379,8 +379,13 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
         } else {
             string blank_gt = ".";
             if (gbwt_sample_to_phase_range.count(sample_name)) {
-                auto& phase_range = gbwt_sample_to_phase_range.at(sample_name);
-                for (int phase = phase_range.first + 1; phase <= phase_range.second; ++phase) {
+                // note: this is the same logic used when filling in actual genotypes
+                int min_phase, max_phase;
+                std::tie(min_phase, max_phase) = gbwt_sample_to_phase_range.at(sample_name);
+                // shift left by 1 unless min phase is 0
+                int sample_ploidy = min_phase == 0 ? max_phase + 1 : max_phase;
+                assert(sample_ploidy > 0);
+                for (int phase = 1; phase < sample_ploidy; ++phase) {
                     blank_gt += "|.";
                 }
             }
@@ -1132,6 +1137,10 @@ string Deconstructor::get_vcf_header() {
                 if (haplotype == PathMetadata::NO_HAPLOTYPE) {
                     haplotype = 0;
                 }
+                if (haplotype > 10) {
+                    cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << haplotype
+                         << ", parsed from path, " << path_name << ": This will leed to giant GT entries.";
+                }
                 sample_to_haps[sample_name].insert((int)haplotype);
                 sample_names.insert(sample_name);
             }
@@ -1159,6 +1168,10 @@ string Deconstructor::get_vcf_header() {
                             // Default to 0.
                             phase = 0;
                         }
+                        if (phase > 10) {
+                            cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << phase
+                                 << ", parsed from GBWT thread, " << path_name << ": This will leed to giant GT entries.";
+                        }
                         sample_to_haps[sample_name].insert((int)phase);
                         sample_names.insert(sample_name);
                     }

diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp
@@ -415,6 +415,127 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
     return build_gbwt({}, "GBWT", &graph);
 }
 
+//------------------------------------------------------------------------------
+
+// GAF / GAM to GBWT.
+
+// Returns (count, symbol) for present symbols.
+// Assumes that the input is non-empty and all vectors have the same length.
+std::vector<std::pair<size_t, size_t>> present_symbols(const std::vector<std::vector<size_t>>& counts_by_job) {
+    std::vector<std::pair<size_t, size_t>> result;
+    for (size_t symbol = 0; symbol < counts_by_job.front().size(); symbol++) {
+        size_t count = 0;
+        for (size_t j = 0; j < counts_by_job.size(); j++) {
+            count += counts_by_job[j][symbol];
+        }
+        if (count > 0) {
+            result.push_back(std::make_pair(count, symbol));
+        }
+    }
+    return result;
+}
+
+// Inputs: (count, symbol)
+// Outputs: (code length, symbol) sorted by code length
+std::vector<std::pair<size_t, size_t>> canonical_huffman(const std::vector<std::pair<size_t, size_t>>& symbols) {
+    if (symbols.empty()) {
+        return std::vector<std::pair<size_t, size_t>>();
+    }
+
+    // Internal nodes as pairs of children.
+    std::vector<std::pair<size_t, size_t>> nodes;
+    // (count, id), with id referring first to symbols and then to nodes.
+    std::vector<std::pair<size_t, size_t>> queue; queue.reserve(symbols.size());
+    for (size_t i = 0; i < symbols.size(); i++) {
+        queue.push_back(std::make_pair(symbols[i].first, i));
+    }
+    std::make_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+
+    // Build the Huffman tree.
+    while (queue.size() > 1) {
+        std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+        auto left = queue.back(); queue.pop_back();
+        std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+        auto right = queue.back(); queue.pop_back();
+        size_t count = left.first + right.first;
+        size_t id = symbols.size() + nodes.size();
+        nodes.push_back(std::make_pair(left.second, right.second));
+        queue.push_back(std::make_pair(count, id));
+        std::push_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+    }
+
+    // Determine the code lengths.
+    std::vector<std::pair<size_t, size_t>> result(symbols.size());
+    std::function<void(size_t, size_t)> dfs = [&](size_t node, size_t depth) {
+        if (node < symbols.size()) {
+            result[node] = std::make_pair(depth, symbols[node].second);
+        } else {
+            dfs(nodes[node - symbols.size()].first, depth + 1);
+            dfs(nodes[node - symbols.size()].second, depth + 1);
+        }
+    };
+    dfs(queue.front().second, 0);
+
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+std::string longest_common_prefix(const std::vector<std::vector<std::string>>& read_names) {
+    bool has_prefix = false;
+    std::string prefix;
+    for (auto& names : read_names) {
+        for (auto& name : names) {
+            if (!has_prefix) {
+                prefix = name;
+                has_prefix = true;
+            } else {
+                size_t i = 0;
+                while (i < prefix.length() && i < name.length() && prefix[i] == name[i]) {
+                    i++;
+                }
+                prefix.resize(i);
+            }
+        }
+    }
+    return prefix;
+}
+
+// This consumes the read names.
+void create_alignment_metadata(
+    std::vector<std::vector<std::string>>& read_names,
+    const std::string& prefix,
+    gbwt::Metadata& metadata) {
+
+    // We can use 32-bit values, as GBWT metadata uses them as well.
+    string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
+    for (auto& names : read_names) {
+        for (const std::string& name : names) {
+            std::string sample_name = name.substr(prefix.length());
+            std::uint32_t sample_id = 0, fragment_count = 0;
+            auto iter = read_info.find(sample_name);
+            if (iter == read_info.end()) {
+                sample_id = read_info.size();
+                read_info[sample_name] = std::make_pair(sample_id, fragment_count);
+            } else {
+                sample_id = iter->second.first;
+                fragment_count = iter->second.second;
+                iter->second.second++;
+            }
+            metadata.addPath(sample_id, 0, 0, fragment_count);
+        }
+        names = std::vector<std::string>();
+    }
+    std::vector<std::string> sample_names(read_info.size());
+    for (auto& p : read_info) {
+        sample_names[p.second.first] = p.first;
+    }
+    read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
+
+    metadata.setSamples(sample_names);
+    metadata.setContigs({ "unknown" });
+    metadata.setHaplotypes(sample_names.size());
+}
+
 std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& graph,
     const std::vector<std::string>& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const {
 
@@ -442,6 +563,7 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     // GBWT construction.
     std::vector<std::mutex> builder_mutexes(jobs.size());
     std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
+    std::vector<std::vector<size_t>> quality_values(jobs.size(), std::vector<size_t>(256, 0));
     // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang).
     // TODO: Maybe use concatenated 0-terminated names?
     std::vector<std::vector<std::string>> read_names(jobs.size());
@@ -468,6 +590,10 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
                 job_id = 0;
             }
         }
+        for (auto c : aln.quality()) {
+            unsigned char value = io::quality_short_to_char(c);
+            quality_values[job_id][value]++;
+        }
         {
             // Insert the path into the appropriate builder and record the read name.
             std::lock_guard<std::mutex> lock(builder_mutexes[job_id]);
@@ -505,6 +631,21 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     std::unique_ptr<gbwt::GBWT> result(new gbwt::GBWT(partial_indexes));
     partial_indexes.clear();
 
+    // Determine the quality score alphabet and canonical Huffman code lengths.
+    // The items are first (count, character value) and then (code length, character value).
+    std::vector<std::pair<size_t, size_t>> present = present_symbols(quality_values);
+    present = canonical_huffman(present);
+    std::string alphabet, code_lengths;
+    for (auto symbol : present) {
+        alphabet.push_back(symbol.second);
+        if (code_lengths.length() > 0) {
+            code_lengths.push_back(',');
+        }
+        code_lengths.append(std::to_string(symbol.first));
+    }
+    result->tags.set("quality_values", alphabet);
+    result->tags.set("quality_lengths", code_lengths);
+
     // Create the metadata.
     if (this->show_progress) {
         #pragma omp critical
@@ -513,34 +654,9 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
         }
     }
     result->addMetadata();
-    result->metadata.setContigs({ "unknown" });
-    {
-        // We can use 32-bit values, as GBWT metadata uses them as well.
-        string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
-        for (auto& names : read_names) {
-            for (const std::string& name : names) {
-                std::uint32_t sample_id = 0, fragment_count = 0;
-                auto iter = read_info.find(name);
-                if (iter == read_info.end()) {
-                    sample_id = read_info.size();
-                    read_info[name] = std::make_pair(sample_id, fragment_count);
-                } else {
-                    sample_id = iter->second.first;
-                    fragment_count = iter->second.second;
-                    iter->second.second++;
-                }
-                result->metadata.addPath(sample_id, 0, 0, fragment_count);
-            }
-            names = std::vector<std::string>();
-        }
-        std::vector<std::string> sample_names(read_info.size());
-        for (auto& p : read_info) {
-            sample_names[p.second.first] = p.first;
-        }
-        read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
-        result->metadata.setSamples(sample_names);
-        result->metadata.setHaplotypes(sample_names.size());
-    }
+    std::string prefix = longest_common_prefix(read_names);
+    result->tags.set("sample_prefix", prefix);
+    create_alignment_metadata(read_names, prefix, result->metadata);
     if (this->show_progress) {
         #pragma omp critical
         {
@@ -555,4 +671,6 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     return result;
 }
 
+//------------------------------------------------------------------------------
+
 }
diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp
@@ -140,6 +140,15 @@ class HaplotypeIndexer : public Progressive {
      * Runs approximately the given number of jobs in parallel. The exact
      * number depends on the sizes of weakly connected components in the graph.
      * Each job uses at most 2 threads.
+     *
+     * In order to save space, the longest common prefix of alignment names
+     * is stored as GBWT tag sample_prefix. Only the diverging suffixes are
+     * stored as sample names.
+     *
+     * Additionally, the list of values used in quality strings is stored as GBWT
+     * tag quality_values. Tag quality_codes stores a comma-separated list of
+     * canonical Huffman code lengths for the quality values. The values are
+     * sorted by code length.
      */
     std::unique_ptr<gbwt::GBWT> build_gbwt(const HandleGraph& graph,
         const std::vector<std::string>& aln_filenames, const std::string& aln_format,

diff --git a/src/readfilter.hpp b/src/readfilter.hpp
@@ -1534,6 +1534,8 @@ inline void ReadFilter<Alignment>::emit_tsv(Alignment& read, std::ostream& out)
             out << softclip_start(read);
         } else if (field == "softclip_end") {
             out << softclip_end(read);
+        } else if (field == "identity") {
+            out << read.identity();
         } else if (field == "mapping_quality") {
             out << get_mapq(read); 
         } else if (field == "sequence") {