Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into recursive-chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Jan 17, 2025
2 parents d2baa7e + e91be89 commit 11f8b6b
Show file tree
Hide file tree
Showing 17 changed files with 817 additions and 157 deletions.
2 changes: 1 addition & 1 deletion deps/gbwtgraph
Submodule gbwtgraph updated 1 files
+30 −21 src/utils.cpp
24 changes: 24 additions & 0 deletions src/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2847,6 +2847,30 @@ void alignment_set_distance_to_correct(Alignment& aln, const map<string ,vector<
}
}

void check_quality_length(const Alignment& aln) {
size_t quality_length = aln.quality().length();
if (quality_length == 0 || quality_length == aln.sequence().length()) {
// This length is acceptable.
return;
}

bool too_short = quality_length < aln.sequence().length();

std::stringstream ss;
ss << "Read " << aln.name() << " has " << aln.sequence().length() << " bases of sequence but ";
if (too_short) {
ss << "only ";
}
ss << quality_length << " base quality values.";
if (too_short) {
ss << " Was the quality information truncated?";
}

#pragma omp critical (cerr)
std::cerr << "error [vg::alignment.cpp]: " << ss.str() << std::endl;
exit(1);
}

AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph, bool check_sequence) {
size_t read_idx = 0;
for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
Expand Down
7 changes: 7 additions & 0 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,13 @@ map<string ,vector<pair<size_t, bool> > > alignment_refpos_to_path_offsets(const
void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map<string, string>* translation = nullptr);
void alignment_set_distance_to_correct(Alignment& aln, const map<string, vector<pair<size_t, bool>>>& base_offsets, const unordered_map<string, string>* translation = nullptr);

/**
* Stop the program and print a useful error message if the alignment has
* quality values, but not the right number of them for the number of sequence
* bases.
*/
void check_quality_length(const Alignment& aln);

/**
* Represents a report on whether an alignment makes sense in the context of a graph.
*/
Expand Down
17 changes: 15 additions & 2 deletions src/deconstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,13 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
} else {
string blank_gt = ".";
if (gbwt_sample_to_phase_range.count(sample_name)) {
auto& phase_range = gbwt_sample_to_phase_range.at(sample_name);
for (int phase = phase_range.first + 1; phase <= phase_range.second; ++phase) {
// note: this is the same logic used when filling in actual genotypes
int min_phase, max_phase;
std::tie(min_phase, max_phase) = gbwt_sample_to_phase_range.at(sample_name);
// shift left by 1 unless min phase is 0
int sample_ploidy = min_phase == 0 ? max_phase + 1 : max_phase;
assert(sample_ploidy > 0);
for (int phase = 1; phase < sample_ploidy; ++phase) {
blank_gt += "|.";
}
}
Expand Down Expand Up @@ -1132,6 +1137,10 @@ string Deconstructor::get_vcf_header() {
if (haplotype == PathMetadata::NO_HAPLOTYPE) {
haplotype = 0;
}
if (haplotype > 10) {
cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << haplotype
<< ", parsed from path, " << path_name << ": This will leed to giant GT entries.";
}
sample_to_haps[sample_name].insert((int)haplotype);
sample_names.insert(sample_name);
}
Expand Down Expand Up @@ -1159,6 +1168,10 @@ string Deconstructor::get_vcf_header() {
// Default to 0.
phase = 0;
}
if (phase > 10) {
cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << phase
<< ", parsed from GBWT thread, " << path_name << ": This will leed to giant GT entries.";
}
sample_to_haps[sample_name].insert((int)phase);
sample_names.insert(sample_name);
}
Expand Down
174 changes: 146 additions & 28 deletions src/haplotype_indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,127 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
return build_gbwt({}, "GBWT", &graph);
}

//------------------------------------------------------------------------------

// GAF / GAM to GBWT.

// Returns (count, symbol) for present symbols.
// Assumes that the input is non-empty and all vectors have the same length.
std::vector<std::pair<size_t, size_t>> present_symbols(const std::vector<std::vector<size_t>>& counts_by_job) {
std::vector<std::pair<size_t, size_t>> result;
for (size_t symbol = 0; symbol < counts_by_job.front().size(); symbol++) {
size_t count = 0;
for (size_t j = 0; j < counts_by_job.size(); j++) {
count += counts_by_job[j][symbol];
}
if (count > 0) {
result.push_back(std::make_pair(count, symbol));
}
}
return result;
}

// Inputs: (count, symbol)
// Outputs: (code length, symbol) sorted by code length
std::vector<std::pair<size_t, size_t>> canonical_huffman(const std::vector<std::pair<size_t, size_t>>& symbols) {
if (symbols.empty()) {
return std::vector<std::pair<size_t, size_t>>();
}

// Internal nodes as pairs of children.
std::vector<std::pair<size_t, size_t>> nodes;
// (count, id), with id referring first to symbols and then to nodes.
std::vector<std::pair<size_t, size_t>> queue; queue.reserve(symbols.size());
for (size_t i = 0; i < symbols.size(); i++) {
queue.push_back(std::make_pair(symbols[i].first, i));
}
std::make_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());

// Build the Huffman tree.
while (queue.size() > 1) {
std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
auto left = queue.back(); queue.pop_back();
std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
auto right = queue.back(); queue.pop_back();
size_t count = left.first + right.first;
size_t id = symbols.size() + nodes.size();
nodes.push_back(std::make_pair(left.second, right.second));
queue.push_back(std::make_pair(count, id));
std::push_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
}

// Determine the code lengths.
std::vector<std::pair<size_t, size_t>> result(symbols.size());
std::function<void(size_t, size_t)> dfs = [&](size_t node, size_t depth) {
if (node < symbols.size()) {
result[node] = std::make_pair(depth, symbols[node].second);
} else {
dfs(nodes[node - symbols.size()].first, depth + 1);
dfs(nodes[node - symbols.size()].second, depth + 1);
}
};
dfs(queue.front().second, 0);

std::sort(result.begin(), result.end());
return result;
}

std::string longest_common_prefix(const std::vector<std::vector<std::string>>& read_names) {
bool has_prefix = false;
std::string prefix;
for (auto& names : read_names) {
for (auto& name : names) {
if (!has_prefix) {
prefix = name;
has_prefix = true;
} else {
size_t i = 0;
while (i < prefix.length() && i < name.length() && prefix[i] == name[i]) {
i++;
}
prefix.resize(i);
}
}
}
return prefix;
}

// This consumes the read names.
void create_alignment_metadata(
std::vector<std::vector<std::string>>& read_names,
const std::string& prefix,
gbwt::Metadata& metadata) {

// We can use 32-bit values, as GBWT metadata uses them as well.
string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
for (auto& names : read_names) {
for (const std::string& name : names) {
std::string sample_name = name.substr(prefix.length());
std::uint32_t sample_id = 0, fragment_count = 0;
auto iter = read_info.find(sample_name);
if (iter == read_info.end()) {
sample_id = read_info.size();
read_info[sample_name] = std::make_pair(sample_id, fragment_count);
} else {
sample_id = iter->second.first;
fragment_count = iter->second.second;
iter->second.second++;
}
metadata.addPath(sample_id, 0, 0, fragment_count);
}
names = std::vector<std::string>();
}
std::vector<std::string> sample_names(read_info.size());
for (auto& p : read_info) {
sample_names[p.second.first] = p.first;
}
read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();

metadata.setSamples(sample_names);
metadata.setContigs({ "unknown" });
metadata.setHaplotypes(sample_names.size());
}

std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& graph,
const std::vector<std::string>& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const {

Expand Down Expand Up @@ -442,6 +563,7 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
// GBWT construction.
std::vector<std::mutex> builder_mutexes(jobs.size());
std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
std::vector<std::vector<size_t>> quality_values(jobs.size(), std::vector<size_t>(256, 0));
// This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang).
// TODO: Maybe use concatenated 0-terminated names?
std::vector<std::vector<std::string>> read_names(jobs.size());
Expand All @@ -468,6 +590,10 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
job_id = 0;
}
}
for (auto c : aln.quality()) {
unsigned char value = io::quality_short_to_char(c);
quality_values[job_id][value]++;
}
{
// Insert the path into the appropriate builder and record the read name.
std::lock_guard<std::mutex> lock(builder_mutexes[job_id]);
Expand Down Expand Up @@ -505,6 +631,21 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
std::unique_ptr<gbwt::GBWT> result(new gbwt::GBWT(partial_indexes));
partial_indexes.clear();

// Determine the quality score alphabet and canonical Huffman code lengths.
// The items are first (count, character value) and then (code length, character value).
std::vector<std::pair<size_t, size_t>> present = present_symbols(quality_values);
present = canonical_huffman(present);
std::string alphabet, code_lengths;
for (auto symbol : present) {
alphabet.push_back(symbol.second);
if (code_lengths.length() > 0) {
code_lengths.push_back(',');
}
code_lengths.append(std::to_string(symbol.first));
}
result->tags.set("quality_values", alphabet);
result->tags.set("quality_lengths", code_lengths);

// Create the metadata.
if (this->show_progress) {
#pragma omp critical
Expand All @@ -513,34 +654,9 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
}
}
result->addMetadata();
result->metadata.setContigs({ "unknown" });
{
// We can use 32-bit values, as GBWT metadata uses them as well.
string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
for (auto& names : read_names) {
for (const std::string& name : names) {
std::uint32_t sample_id = 0, fragment_count = 0;
auto iter = read_info.find(name);
if (iter == read_info.end()) {
sample_id = read_info.size();
read_info[name] = std::make_pair(sample_id, fragment_count);
} else {
sample_id = iter->second.first;
fragment_count = iter->second.second;
iter->second.second++;
}
result->metadata.addPath(sample_id, 0, 0, fragment_count);
}
names = std::vector<std::string>();
}
std::vector<std::string> sample_names(read_info.size());
for (auto& p : read_info) {
sample_names[p.second.first] = p.first;
}
read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
result->metadata.setSamples(sample_names);
result->metadata.setHaplotypes(sample_names.size());
}
std::string prefix = longest_common_prefix(read_names);
result->tags.set("sample_prefix", prefix);
create_alignment_metadata(read_names, prefix, result->metadata);
if (this->show_progress) {
#pragma omp critical
{
Expand All @@ -555,4 +671,6 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
return result;
}

//------------------------------------------------------------------------------

}
9 changes: 9 additions & 0 deletions src/haplotype_indexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,15 @@ class HaplotypeIndexer : public Progressive {
* Runs approximately the given number of jobs in parallel. The exact
* number depends on the sizes of weakly connected components in the graph.
* Each job uses at most 2 threads.
*
* In order to save space, the longest common prefix of alignment names
* is stored as GBWT tag sample_prefix. Only the diverging suffixes are
* stored as sample names.
*
* Additionally, the list of values used in quality strings is stored as GBWT
* tag quality_values. Tag quality_codes stores a comma-separated list of
* canonical Huffman code lengths for the quality values. The values are
* sorted by code length.
*/
std::unique_ptr<gbwt::GBWT> build_gbwt(const HandleGraph& graph,
const std::vector<std::string>& aln_filenames, const std::string& aln_format,
Expand Down
2 changes: 2 additions & 0 deletions src/readfilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1534,6 +1534,8 @@ inline void ReadFilter<Alignment>::emit_tsv(Alignment& read, std::ostream& out)
out << softclip_start(read);
} else if (field == "softclip_end") {
out << softclip_end(read);
} else if (field == "identity") {
out << read.identity();
} else if (field == "mapping_quality") {
out << get_mapq(read);
} else if (field == "sequence") {
Expand Down
Loading

1 comment on commit 11f8b6b

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch recursive-chunk. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17253 seconds

Please sign in to comment.