diff --git a/deps/gbwtgraph b/deps/gbwtgraph
index e16221f047..9fa7f6fa82 160000
--- a/deps/gbwtgraph
+++ b/deps/gbwtgraph
@@ -1 +1 @@
-Subproject commit e16221f0479cdf6a7c709ce353687a059503bb67
+Subproject commit 9fa7f6fa82f329399a99c1e84573bdae96940d1b
diff --git a/src/alignment.cpp b/src/alignment.cpp
index 274476e16c..49a39e0787 100644
--- a/src/alignment.cpp
+++ b/src/alignment.cpp
@@ -2847,6 +2847,30 @@ void alignment_set_distance_to_correct(Alignment& aln, const map<string ,vector<
     }
 }
 
+void check_quality_length(const Alignment& aln) {
+    size_t quality_length = aln.quality().length();
+    if (quality_length == 0 || quality_length == aln.sequence().length()) {
+        // This length is acceptable.
+        return;
+    }
+
+    bool too_short = quality_length < aln.sequence().length();
+
+    std::stringstream ss;
+    ss << "Read " << aln.name() << " has " << aln.sequence().length() << " bases of sequence but ";
+    if (too_short) {
+        ss << "only ";
+    }
+    ss << quality_length << " base quality values.";
+    if (too_short) {
+        ss << " Was the quality information truncated?";
+    }
+    
+    #pragma omp critical (cerr)
+    std::cerr << "error [vg::alignment.cpp]: " << ss.str() << std::endl;
+    exit(1);
+}
+
 AlignmentValidity alignment_is_valid(const Alignment& aln, const HandleGraph* hgraph, bool check_sequence) {
     size_t read_idx = 0;
     for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
diff --git a/src/alignment.hpp b/src/alignment.hpp
index f5230775bb..a90dc500b0 100644
--- a/src/alignment.hpp
+++ b/src/alignment.hpp
@@ -328,6 +328,13 @@ map<string ,vector<pair<size_t, bool> > > alignment_refpos_to_path_offsets(const
 void alignment_set_distance_to_correct(Alignment& aln, const Alignment& base, const unordered_map<string, string>* translation = nullptr);
 void alignment_set_distance_to_correct(Alignment& aln, const map<string, vector<pair<size_t, bool>>>& base_offsets, const unordered_map<string, string>* translation = nullptr);
 
+/**
+ * Stop the program and print a useful error message if the alignment has
+ * quality values, but not the right number of them for the number of sequence
+ * bases. 
+ */
+void check_quality_length(const Alignment& aln);
+
 /**
  * Represents a report on whether an alignment makes sense in the context of a graph.
  */
diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp
index 0f15b8139d..a0ed588db4 100644
--- a/src/deconstructor.cpp
+++ b/src/deconstructor.cpp
@@ -379,8 +379,13 @@ void Deconstructor::get_genotypes(vcflib::Variant& v, const vector<string>& name
         } else {
             string blank_gt = ".";
             if (gbwt_sample_to_phase_range.count(sample_name)) {
-                auto& phase_range = gbwt_sample_to_phase_range.at(sample_name);
-                for (int phase = phase_range.first + 1; phase <= phase_range.second; ++phase) {
+                // note: this is the same logic used when filling in actual genotypes
+                int min_phase, max_phase;
+                std::tie(min_phase, max_phase) = gbwt_sample_to_phase_range.at(sample_name);
+                // shift left by 1 unless min phase is 0
+                int sample_ploidy = min_phase == 0 ? max_phase + 1 : max_phase;
+                assert(sample_ploidy > 0);
+                for (int phase = 1; phase < sample_ploidy; ++phase) {
                     blank_gt += "|.";
                 }
             }
@@ -1132,6 +1137,10 @@ string Deconstructor::get_vcf_header() {
                 if (haplotype == PathMetadata::NO_HAPLOTYPE) {
                     haplotype = 0;
                 }
+                if (haplotype > 10) {
+                    cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << haplotype
+                         << ", parsed from path, " << path_name << ": This will leed to giant GT entries.";
+                }
                 sample_to_haps[sample_name].insert((int)haplotype);
                 sample_names.insert(sample_name);
             }
@@ -1159,6 +1168,10 @@ string Deconstructor::get_vcf_header() {
                             // Default to 0.
                             phase = 0;
                         }
+                        if (phase > 10) {
+                            cerr << "Warning [vg deconstruct]: Suspiciously large haplotype, " << phase
+                                 << ", parsed from GBWT thread, " << path_name << ": This will leed to giant GT entries.";
+                        }
                         sample_to_haps[sample_name].insert((int)phase);
                         sample_names.insert(sample_name);
                     }
diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp
index 75399a86e6..38d79601e8 100644
--- a/src/haplotype_indexer.cpp
+++ b/src/haplotype_indexer.cpp
@@ -415,6 +415,127 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
     return build_gbwt({}, "GBWT", &graph);
 }
 
+//------------------------------------------------------------------------------
+
+// GAF / GAM to GBWT.
+
+// Returns (count, symbol) for present symbols.
+// Assumes that the input is non-empty and all vectors have the same length.
+std::vector<std::pair<size_t, size_t>> present_symbols(const std::vector<std::vector<size_t>>& counts_by_job) {
+    std::vector<std::pair<size_t, size_t>> result;
+    for (size_t symbol = 0; symbol < counts_by_job.front().size(); symbol++) {
+        size_t count = 0;
+        for (size_t j = 0; j < counts_by_job.size(); j++) {
+            count += counts_by_job[j][symbol];
+        }
+        if (count > 0) {
+            result.push_back(std::make_pair(count, symbol));
+        }
+    }
+    return result;
+}
+
+// Inputs: (count, symbol)
+// Outputs: (code length, symbol) sorted by code length
+std::vector<std::pair<size_t, size_t>> canonical_huffman(const std::vector<std::pair<size_t, size_t>>& symbols) {
+    if (symbols.empty()) {
+        return std::vector<std::pair<size_t, size_t>>();
+    }
+
+    // Internal nodes as pairs of children.
+    std::vector<std::pair<size_t, size_t>> nodes;
+    // (count, id), with id referring first to symbols and then to nodes.
+    std::vector<std::pair<size_t, size_t>> queue; queue.reserve(symbols.size());
+    for (size_t i = 0; i < symbols.size(); i++) {
+        queue.push_back(std::make_pair(symbols[i].first, i));
+    }
+    std::make_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+
+    // Build the Huffman tree.
+    while (queue.size() > 1) {
+        std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+        auto left = queue.back(); queue.pop_back();
+        std::pop_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+        auto right = queue.back(); queue.pop_back();
+        size_t count = left.first + right.first;
+        size_t id = symbols.size() + nodes.size();
+        nodes.push_back(std::make_pair(left.second, right.second));
+        queue.push_back(std::make_pair(count, id));
+        std::push_heap(queue.begin(), queue.end(), std::greater<std::pair<size_t, size_t>>());
+    }
+
+    // Determine the code lengths.
+    std::vector<std::pair<size_t, size_t>> result(symbols.size());
+    std::function<void(size_t, size_t)> dfs = [&](size_t node, size_t depth) {
+        if (node < symbols.size()) {
+            result[node] = std::make_pair(depth, symbols[node].second);
+        } else {
+            dfs(nodes[node - symbols.size()].first, depth + 1);
+            dfs(nodes[node - symbols.size()].second, depth + 1);
+        }
+    };
+    dfs(queue.front().second, 0);
+
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+std::string longest_common_prefix(const std::vector<std::vector<std::string>>& read_names) {
+    bool has_prefix = false;
+    std::string prefix;
+    for (auto& names : read_names) {
+        for (auto& name : names) {
+            if (!has_prefix) {
+                prefix = name;
+                has_prefix = true;
+            } else {
+                size_t i = 0;
+                while (i < prefix.length() && i < name.length() && prefix[i] == name[i]) {
+                    i++;
+                }
+                prefix.resize(i);
+            }
+        }
+    }
+    return prefix;
+}
+
+// This consumes the read names.
+void create_alignment_metadata(
+    std::vector<std::vector<std::string>>& read_names,
+    const std::string& prefix,
+    gbwt::Metadata& metadata) {
+
+    // We can use 32-bit values, as GBWT metadata uses them as well.
+    string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
+    for (auto& names : read_names) {
+        for (const std::string& name : names) {
+            std::string sample_name = name.substr(prefix.length());
+            std::uint32_t sample_id = 0, fragment_count = 0;
+            auto iter = read_info.find(sample_name);
+            if (iter == read_info.end()) {
+                sample_id = read_info.size();
+                read_info[sample_name] = std::make_pair(sample_id, fragment_count);
+            } else {
+                sample_id = iter->second.first;
+                fragment_count = iter->second.second;
+                iter->second.second++;
+            }
+            metadata.addPath(sample_id, 0, 0, fragment_count);
+        }
+        names = std::vector<std::string>();
+    }
+    std::vector<std::string> sample_names(read_info.size());
+    for (auto& p : read_info) {
+        sample_names[p.second.first] = p.first;
+    }
+    read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
+
+    metadata.setSamples(sample_names);
+    metadata.setContigs({ "unknown" });
+    metadata.setHaplotypes(sample_names.size());
+}
+
 std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& graph,
     const std::vector<std::string>& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const {
 
@@ -442,6 +563,7 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     // GBWT construction.
     std::vector<std::mutex> builder_mutexes(jobs.size());
     std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
+    std::vector<std::vector<size_t>> quality_values(jobs.size(), std::vector<size_t>(256, 0));
     // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang).
     // TODO: Maybe use concatenated 0-terminated names?
     std::vector<std::vector<std::string>> read_names(jobs.size());
@@ -468,6 +590,10 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
                 job_id = 0;
             }
         }
+        for (auto c : aln.quality()) {
+            unsigned char value = io::quality_short_to_char(c);
+            quality_values[job_id][value]++;
+        }
         {
             // Insert the path into the appropriate builder and record the read name.
             std::lock_guard<std::mutex> lock(builder_mutexes[job_id]);
@@ -505,6 +631,21 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     std::unique_ptr<gbwt::GBWT> result(new gbwt::GBWT(partial_indexes));
     partial_indexes.clear();
 
+    // Determine the quality score alphabet and canonical Huffman code lengths.
+    // The items are first (count, character value) and then (code length, character value).
+    std::vector<std::pair<size_t, size_t>> present = present_symbols(quality_values);
+    present = canonical_huffman(present);
+    std::string alphabet, code_lengths;
+    for (auto symbol : present) {
+        alphabet.push_back(symbol.second);
+        if (code_lengths.length() > 0) {
+            code_lengths.push_back(',');
+        }
+        code_lengths.append(std::to_string(symbol.first));
+    }
+    result->tags.set("quality_values", alphabet);
+    result->tags.set("quality_lengths", code_lengths);
+
     // Create the metadata.
     if (this->show_progress) {
         #pragma omp critical
@@ -513,34 +654,9 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
         }
     }
     result->addMetadata();
-    result->metadata.setContigs({ "unknown" });
-    {
-        // We can use 32-bit values, as GBWT metadata uses them as well.
-        string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
-        for (auto& names : read_names) {
-            for (const std::string& name : names) {
-                std::uint32_t sample_id = 0, fragment_count = 0;
-                auto iter = read_info.find(name);
-                if (iter == read_info.end()) {
-                    sample_id = read_info.size();
-                    read_info[name] = std::make_pair(sample_id, fragment_count);
-                } else {
-                    sample_id = iter->second.first;
-                    fragment_count = iter->second.second;
-                    iter->second.second++;
-                }
-                result->metadata.addPath(sample_id, 0, 0, fragment_count);
-            }
-            names = std::vector<std::string>();
-        }
-        std::vector<std::string> sample_names(read_info.size());
-        for (auto& p : read_info) {
-            sample_names[p.second.first] = p.first;
-        }
-        read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
-        result->metadata.setSamples(sample_names);
-        result->metadata.setHaplotypes(sample_names.size());
-    }
+    std::string prefix = longest_common_prefix(read_names);
+    result->tags.set("sample_prefix", prefix);
+    create_alignment_metadata(read_names, prefix, result->metadata);
     if (this->show_progress) {
         #pragma omp critical
         {
@@ -555,4 +671,6 @@ std::unique_ptr<gbwt::GBWT> HaplotypeIndexer::build_gbwt(const HandleGraph& grap
     return result;
 }
 
+//------------------------------------------------------------------------------
+
 }
diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp
index ca212a9426..ab3a69c587 100644
--- a/src/haplotype_indexer.hpp
+++ b/src/haplotype_indexer.hpp
@@ -140,6 +140,15 @@ class HaplotypeIndexer : public Progressive {
      * Runs approximately the given number of jobs in parallel. The exact
      * number depends on the sizes of weakly connected components in the graph.
      * Each job uses at most 2 threads.
+     *
+     * In order to save space, the longest common prefix of alignment names
+     * is stored as GBWT tag sample_prefix. Only the diverging suffixes are
+     * stored as sample names.
+     *
+     * Additionally, the list of values used in quality strings is stored as GBWT
+     * tag quality_values. Tag quality_codes stores a comma-separated list of
+     * canonical Huffman code lengths for the quality values. The values are
+     * sorted by code length.
      */
     std::unique_ptr<gbwt::GBWT> build_gbwt(const HandleGraph& graph,
         const std::vector<std::string>& aln_filenames, const std::string& aln_format,
diff --git a/src/readfilter.hpp b/src/readfilter.hpp
index 3d38bba2f4..31d2caa1aa 100644
--- a/src/readfilter.hpp
+++ b/src/readfilter.hpp
@@ -1534,6 +1534,8 @@ inline void ReadFilter<Alignment>::emit_tsv(Alignment& read, std::ostream& out)
             out << softclip_start(read);
         } else if (field == "softclip_end") {
             out << softclip_end(read);
+        } else if (field == "identity") {
+            out << read.identity();
         } else if (field == "mapping_quality") {
             out << get_mapq(read); 
         } else if (field == "sequence") {
diff --git a/src/recombinator.cpp b/src/recombinator.cpp
index a4ababbcb5..65d540b2d0 100644
--- a/src/recombinator.cpp
+++ b/src/recombinator.cpp
@@ -66,7 +66,7 @@ hash_map<Haplotypes::Subchain::kmer_type, size_t> Haplotypes::kmer_counts(const
         for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) {
             const Subchain& subchain = chain.subchains[subchain_id];
             for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) {
-                result[subchain.kmers[kmer_id].first] = 0;
+                result[subchain.kmers[kmer_id]] = 0;
             }
         }
     }
@@ -146,51 +146,145 @@ std::string Haplotypes::Subchain::to_string() const {
     return result;
 }
 
+size_t Haplotypes::Subchain::distance(const gbwtgraph::GBZ& gbz, size_t i) const {
+    if (this->type != normal || i >= this->sequences.size()) {
+        return 0;
+    }
+
+    size_t result = 1;
+    gbwt::edge_type curr(this->start, this->sequences[i].second);
+    while (true) {
+        curr = gbz.index.LF(curr);
+        if (curr.first == gbwt::ENDMARKER || curr.first == this->end) {
+            break;
+        }
+        result += gbz.graph.get_length(gbwtgraph::GBWTGraph::node_to_handle(curr.first));
+    }
+
+    return result;
+}
+
+// TODO: What is the right formula?
+double Haplotypes::Subchain::badness(const gbwtgraph::GBZ& gbz) const {
+    double result = 0.0;
+
+    // Factor 1: Subchain length, ideally over a reference path.
+    if (this->type == normal) {
+        size_t expected_length = HaplotypePartitioner::SUBCHAIN_LENGTH;
+        size_t selected = 0;
+        for (size_t i = 0; i < this->sequences.size(); i++) {
+            gbwt::size_type path_id = gbwt::Path::id(this->sequences[i].first);
+            auto found = gbz.graph.id_to_path.find(path_id);
+            if (found != gbz.graph.id_to_path.end()) {
+                selected = i; break;
+            }
+        }
+        size_t length = this->distance(gbz, selected);
+        if (length < expected_length) {
+            result += std::log(static_cast<double>(expected_length) / static_cast<double>(length));
+        }
+    }
+
+    // Factor 2: Number of haplotypes relative to the expected number.
+    size_t expected_haplotypes = gbz.index.metadata.haplotypes();
+    size_t haplotypes = this->sequences.size();
+    if (haplotypes < expected_haplotypes) {
+        result += std::log(static_cast<double>(expected_haplotypes) / static_cast<double>(haplotypes));
+    }
+
+    // Factor 3: Information content of the kmers.
+    // Disabled for the moment.
+    /* double expected_entropy = 4.0 * std::log(static_cast<double>(haplotypes));
+    double entropy = 0.0;
+    for (size_t i = 0; i < this->kmer_counts.size(); i++) {
+        double p = static_cast<double>(this->kmer_counts[i]) / static_cast<double>(haplotypes);
+        entropy -= p * std::log(p);
+    }
+    if (entropy < expected_entropy) {
+        result += expected_entropy - entropy;
+    } */
+
+    return result;
+}
+
 void Haplotypes::Subchain::simple_sds_serialize(std::ostream& out) const {
     sdsl::simple_sds::serialize_value<std::uint64_t>(this->type, out);
     sdsl::simple_sds::serialize_value<gbwt::node_type>(this->start, out);
     sdsl::simple_sds::serialize_value<gbwt::node_type>(this->end, out);
     sdsl::simple_sds::serialize_vector(this->kmers, out);
+    this->kmer_counts.simple_sds_serialize(out);
     sdsl::simple_sds::serialize_vector(this->sequences, out);
     this->kmers_present.simple_sds_serialize(out);
 }
 
-void Haplotypes::Subchain::simple_sds_load(std::istream& in) {
+void load_subchain_header(Haplotypes::Subchain& subchain, std::istream& in) {
     std::uint64_t temp = sdsl::simple_sds::load_value<std::uint64_t>(in);
     switch (temp) {
-    case normal: // Fall through.
-    case prefix: // Fall through.
-    case suffix: // Fall through.
-    case full_haplotype:
-        this->type = static_cast<subchain_t>(temp);
+    case Haplotypes::Subchain::normal: // Fall through.
+    case Haplotypes::Subchain::prefix: // Fall through.
+    case Haplotypes::Subchain::suffix: // Fall through.
+    case Haplotypes::Subchain::full_haplotype:
+        subchain.type = static_cast<Haplotypes::Subchain::subchain_t>(temp);
         break;
     default:
         throw sdsl::simple_sds::InvalidData("Invalid subchain type: " + std::to_string(temp));
     }
 
-    this->start = sdsl::simple_sds::load_value<gbwt::node_type>(in);
-    this->end = sdsl::simple_sds::load_value<gbwt::node_type>(in);
-    bool should_have_start = (this->type == normal || this->type == suffix);
-    bool should_have_end = (this->type == normal || this->type == prefix);
-    if ((this->start != gbwt::ENDMARKER) != should_have_start) {
-        throw sdsl::simple_sds::InvalidData("Subchain start node " + std::to_string(this->start) + " does not match type " + std::to_string(temp));
+    subchain.start = sdsl::simple_sds::load_value<gbwt::node_type>(in);
+    subchain.end = sdsl::simple_sds::load_value<gbwt::node_type>(in);
+    bool should_have_start = (subchain.type == Haplotypes::Subchain::normal || subchain.type == Haplotypes::Subchain::suffix);
+    bool should_have_end = (subchain.type == Haplotypes::Subchain::normal || subchain.type == Haplotypes::Subchain::prefix);
+    if ((subchain.start != gbwt::ENDMARKER) != should_have_start) {
+        throw sdsl::simple_sds::InvalidData("Subchain start node " + std::to_string(subchain.start) + " does not match type " + std::to_string(temp));
     }
-    if ((this->end != gbwt::ENDMARKER) != should_have_end) {
-        throw sdsl::simple_sds::InvalidData("Subchain end node " + std::to_string(this->end) + " does not match type" + std::to_string(temp));
+    if ((subchain.end != gbwt::ENDMARKER) != should_have_end) {
+        throw sdsl::simple_sds::InvalidData("Subchain end node " + std::to_string(subchain.end) + " does not match type" + std::to_string(temp));
     }
+}
 
-    this->kmers = sdsl::simple_sds::load_vector<std::pair<kmer_type, size_t>>(in);
-    this->sequences = sdsl::simple_sds::load_vector<sequence_type>(in);
-    this->kmers_present.simple_sds_load(in);
-    if (kmers_present.size() != kmers.size() * sequences.size()) {
+void load_subchain_kmers_present(Haplotypes::Subchain& subchain, std::istream& in) {
+    subchain.kmers_present.simple_sds_load(in);
+    if (subchain.kmers_present.size() != subchain.kmers.size() * subchain.sequences.size()) {
         throw sdsl::simple_sds::InvalidData("Invalid length for the kmer presence bitvector in subchain from " +
-            std::to_string(this->start) + " to " + std::to_string(this->end));
+            std::to_string(subchain.start) + " to " + std::to_string(subchain.end));
+    }
+}
+
+void Haplotypes::Subchain::load_v1(std::istream& in) {
+    load_subchain_header(*this, in);
+
+    // Kmer and sequence information must be converted to a more compact format.
+    auto kmers_counts = sdsl::simple_sds::load_vector<std::pair<kmer_type, size_t>>(in);
+    auto seqs = sdsl::simple_sds::load_vector<sequence_type>(in);
+    this->kmers = std::vector<kmer_type>(kmers_counts.size());
+    this->kmer_counts = sdsl::int_vector<0>(kmers_counts.size(), 0, sdsl::bits::length(this->sequences.size()));
+    for (size_t i = 0; i < kmers_counts.size(); i++) {
+        this->kmers[i] = kmers_counts[i].first;
+        this->kmer_counts[i] = kmers_counts[i].second;
+    }
+    this->sequences = std::vector<compact_sequence_type>(seqs.size());
+    for (size_t i = 0; i < seqs.size(); i++) {
+        this->sequences[i].first = seqs[i].first;
+        this->sequences[i].second = seqs[i].second;
     }
+
+    load_subchain_kmers_present(*this, in);
+}
+
+void Haplotypes::Subchain::simple_sds_load(std::istream& in) {
+    load_subchain_header(*this, in);
+
+    this->kmers = sdsl::simple_sds::load_vector<kmer_type>(in);
+    this->kmer_counts.simple_sds_load(in);
+    this->sequences = sdsl::simple_sds::load_vector<compact_sequence_type>(in);
+
+    load_subchain_kmers_present(*this, in);
 }
 
 size_t Haplotypes::Subchain::simple_sds_size() const {
     size_t result = sdsl::simple_sds::value_size<std::uint64_t>() + 2 * sdsl::simple_sds::value_size<gbwt::node_type>();
     result += sdsl::simple_sds::vector_size(this->kmers);
+    result += this->kmer_counts.simple_sds_size();
     result += sdsl::simple_sds::vector_size(this->sequences);
     result += this->kmers_present.simple_sds_size();
     return result;
@@ -217,14 +311,25 @@ void Haplotypes::TopLevelChain::simple_sds_load(std::istream& in) {
     }
 }
 
-void Haplotypes::TopLevelChain::load_old(std::istream& in) {
+void Haplotypes::TopLevelChain::load_v1(std::istream& in) {
     this->offset = sdsl::simple_sds::load_value<size_t>(in);
     this->job_id = sdsl::simple_sds::load_value<size_t>(in);
     this->contig_name = "component_" + std::to_string(this->offset);
     size_t subchain_count = sdsl::simple_sds::load_value<size_t>(in);
     this->subchains.resize(subchain_count);
     for (size_t i = 0; i < subchain_count; i++) {
-        this->subchains[i].simple_sds_load(in);
+        this->subchains[i].load_v1(in);
+    }
+}
+
+void Haplotypes::TopLevelChain::load_v2(std::istream& in) {
+    this->offset = sdsl::simple_sds::load_value<size_t>(in);
+    this->job_id = sdsl::simple_sds::load_value<size_t>(in);
+    this->contig_name = sdsl::simple_sds::load_string(in);
+    size_t subchain_count = sdsl::simple_sds::load_value<size_t>(in);
+    this->subchains.resize(subchain_count);
+    for (size_t i = 0; i < subchain_count; i++) {
+        this->subchains[i].load_v1(in);
     }
 }
 
@@ -263,8 +368,10 @@ void Haplotypes::simple_sds_load(std::istream& in) {
     for (auto& chain : this->chains) {
         if (this->header.version == Header::VERSION) {
             chain.simple_sds_load(in);
+        } else if (this->header.version == 2) {
+            chain.load_v2(in);
         } else {
-            chain.load_old(in);
+            chain.load_v1(in);
         }
     }
 
@@ -420,7 +527,14 @@ Haplotypes HaplotypePartitioner::partition_haplotypes(const Parameters& paramete
             result.header.total_subchains += total_subchains;
             result.header.total_kmers += total_kmers;
             if (this->verbosity >= Haplotypes::verbosity_detailed) {
-                std::cerr << "Finished job " << job << " with " << chains.size() << " chains, " << total_subchains << " subchains, and " << total_kmers << " kmers" << std::endl;
+                std::cerr << "Finished job " << job << " with " << chains.size() << " chains (";
+                for (size_t i = 0; i < chains.size(); i++) {
+                    if (i > 0) {
+                        std::cerr << ", ";
+                    }
+                    std::cerr << result.chains[chains[i].offset].contig_name;
+                }
+                std::cerr << "), " << total_subchains << " subchains, and " << total_kmers << " kmers" << std::endl;
             }
         }
     }
@@ -720,7 +834,8 @@ std::vector<HaplotypePartitioner::kmer_type> HaplotypePartitioner::unique_minimi
   empty.
 */
 void present_kmers(const std::vector<std::vector<HaplotypePartitioner::kmer_type>>& sequences,
-    std::vector<std::pair<HaplotypePartitioner::kmer_type, size_t>>& all_kmers,
+    std::vector<HaplotypePartitioner::kmer_type>& all_kmers,
+    sdsl::int_vector<0>& kmer_counts,
     sdsl::bit_vector& kmers_present) {
 
     // Build a map of distinct kmers. For each kmer, record the largest sequence
@@ -744,10 +859,12 @@ void present_kmers(const std::vector<std::vector<HaplotypePartitioner::kmer_type
     // Now take those kmers that occur in some but not in all sequences.
     // Use the first field for storing the offset of the kmer in the vector.
     all_kmers.reserve(present.size());
+    kmer_counts = sdsl::int_vector<0>(present.size(), 0, sdsl::bits::length(sequences.size()));
     size_t offset = 0;
     for (auto iter = present.begin(); iter != present.end(); ++iter) {
         if (iter->second.second < sequences.size()) {
-            all_kmers.push_back({ iter->first, iter->second.second });
+            all_kmers.push_back(iter->first);
+            kmer_counts[offset] = iter->second.second;
             iter->second.first = offset;
             offset++;
         }
@@ -810,7 +927,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain
             output.subchains.push_back({
                 iter->first.type,
                 gbwtgraph::GBWTGraph::handle_to_node(iter->first.start), gbwtgraph::GBWTGraph::handle_to_node(iter->first.end),
-                {}, {}, sdsl::bit_vector()
+                {}, sdsl::int_vector<0>(0, 0, 64), {}, sdsl::bit_vector()
             });
             Haplotypes::Subchain& subchain = output.subchains.back();
             std::vector<std::vector<kmer_type>> kmers_by_sequence;
@@ -818,8 +935,11 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain
             for (sequence_type sequence : iter->second) {
                 kmers_by_sequence.emplace_back(this->unique_minimizers(sequence, iter->first));
             }
-            present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present);
-            subchain.sequences = std::move(iter->second);
+            present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present);
+            subchain.sequences = std::vector<Haplotypes::compact_sequence_type>(iter->second.size());
+            for (size_t i = 0; i < iter->second.size(); i++) {
+                subchain.sequences[i] = Haplotypes::compact_sequence_type(iter->second[i].first, iter->second[i].second);
+            }
         }
     }
 
@@ -830,7 +950,7 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain
         output.subchains.push_back({
             Haplotypes::Subchain::full_haplotype,
             gbwt::ENDMARKER, gbwt::ENDMARKER,
-            {}, {}, sdsl::bit_vector()
+            {}, sdsl::int_vector<0>(0, 0, 64), {}, sdsl::bit_vector()
         });
         Haplotypes::Subchain& subchain = output.subchains.back();
         gbwt::node_type node = gbwtgraph::GBWTGraph::handle_to_node(chain.handle);
@@ -840,10 +960,10 @@ void HaplotypePartitioner::build_subchains(const gbwtgraph::TopLevelChain& chain
         for (auto seq_id : sequences) {
             kmers_by_sequence.emplace_back(this->unique_minimizers(seq_id));
         }
-        present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmers_present);
+        present_kmers(kmers_by_sequence, subchain.kmers, subchain.kmer_counts, subchain.kmers_present);
         subchain.sequences.reserve(sequences.size());
         for (size_t i = 0; i < sequences.size(); i++) {
-            subchain.sequences.push_back({ sequences[i], 0 });
+            subchain.sequences.push_back(Haplotypes::compact_sequence_type(sequences[i], 0));
         }
     }
 }
@@ -1056,19 +1176,75 @@ void RecombinatorHaplotype::suffix(const gbwt::GBWT& index) {
 }
 
 void RecombinatorHaplotype::insert(gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata) {
-    std::string sample_name = "recombination";
+    std::string sample_name = "recombination"; // TODO: Make this a static class variable.
     metadata.add_haplotype(sample_name, this->contig_name, this->id, this->fragment);
     builder.insert(this->path, true);
 }
 
 //------------------------------------------------------------------------------
 
+/*
+ * An additional haplotype fragment from a bad subchain.
+ *
+ * GBWT metadata will be set as following:
+ *
+ * * Sample name is "fragment".
+ * * Contig name is taken from the top-level chain.
+ * * Haplotype identifier is set during construction.
+ * * Fragment identifier is the identifier of the subchain.
+ */
+struct RecombinatorFragment {
+    // GBWT starting position (inclusive).
+    gbwt::edge_type from;
+
+    // GBWT ending position (inclusive).
+    gbwt::node_type to;
+
+    // Haplotype identifier.
+    size_t haplotype;
+
+    // Fragment / subchain identifier.
+    size_t fragment;
+
+    RecombinatorFragment(gbwt::edge_type from, gbwt::node_type to, size_t haplotype, size_t fragment) :
+        from(from), to(to), haplotype(haplotype), fragment(fragment) {}
+
+    // Generates the GBWT path and the metadata for the fragment.
+    void generate(
+        const gbwt::GBWT& index,
+        gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata,
+        const std::string& contig_name
+    ) const;
+};
+
+void RecombinatorFragment::generate(
+    const gbwt::GBWT& index,
+    gbwt::GBWTBuilder& builder, gbwtgraph::MetadataBuilder& metadata,
+    const std::string& contig_name
+) const {
+    gbwt::vector_type path;
+    for (gbwt::edge_type curr = this->from; curr.first != gbwt::ENDMARKER; curr = index.LF(curr)) {
+        path.push_back(curr.first);
+        if (curr.first == this->to) {
+            break;
+        }
+    }
+
+    std::string sample_name = "fragment"; // TODO: Make this a static class variable.
+    metadata.add_haplotype(sample_name, contig_name, this->haplotype, this->fragment);
+    builder.insert(path, true);
+}
+
+//------------------------------------------------------------------------------
+
 void Recombinator::Statistics::combine(const Statistics& another) {
     this->chains += another.chains;
     this->subchains += another.subchains;
+    this->bad_subchains += another.bad_subchains;
     this->fragments += another.fragments;
     this->full_haplotypes += another.full_haplotypes;
     this->haplotypes = std::max(this->haplotypes, another.haplotypes);
+    this->extra_fragments += another.extra_fragments;
     this->connections += another.connections;
     this->ref_paths += another.ref_paths;
     this->kmers += another.kmers;
@@ -1078,6 +1254,9 @@ void Recombinator::Statistics::combine(const Statistics& another) {
 std::ostream& Recombinator::Statistics::print(std::ostream& out) const {
     out << this->haplotypes << " haplotypes for " << this->chains << " chains ("
         << this->full_haplotypes << " full, " << this->subchains << " subchains, " << this->fragments << " fragments)";
+    if (this->bad_subchains > 0) {
+        out << "; " << this->bad_subchains << " bad subchains with " << this->extra_fragments << " extra fragments";
+    }
     if (this->subchains > 0) {
         double connection_rate = static_cast<double>(this->connections) / (this->subchains * this->haplotypes);
         out << "; connection rate " << connection_rate;
@@ -1123,7 +1302,11 @@ void Recombinator::Parameters::print(std::ostream& out) const {
         out << "- kmer coverage " << this->coverage << std::endl;
     }
     if (this->diploid_sampling) {
-        out << "- diploid sampling (" << this->num_haplotypes << " candidates)" << std::endl;
+        out << "- diploid sampling (" << this->num_haplotypes << " candidates";
+        if (this->extra_fragments) {
+            out << ", badness threshold " << this->badness_threshold;
+        }
+        out << ")" << std::endl;
     } else {
         out << "- heuristic sampling (" << this->num_haplotypes << " haplotypes)" << std::endl;
     }
@@ -1157,6 +1340,10 @@ void recombinator_sanity_checks(const Recombinator::Parameters& parameters) {
         std::string msg = "recombinator_sanity_checks(): number of haplotypes cannot be 0";
         throw std::runtime_error(msg);
     }
+    if (parameters.badness_threshold <= 0.0) {
+        std::string msg = "recombinator_sanity_checks(): badness threshold must be positive";
+        throw std::runtime_error(msg);
+    }
     if (parameters.diploid_sampling && parameters.num_haplotypes < 2) {
         std::string msg = "recombinator_sanity_checks(): diploid sampling requires at least 2 haplotypes";
         throw std::runtime_error(msg);
@@ -1372,7 +1559,7 @@ std::vector<std::pair<Recombinator::kmer_presence, double>> classify_kmers(
     std::vector<std::pair<Recombinator::kmer_presence, double>> kmer_types;
     size_t selected_kmers = 0;
     for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) {
-        double count = kmer_counts.at(subchain.kmers[kmer_id].first);
+        double count = kmer_counts.at(subchain.kmers[kmer_id]);
         if (count < absent_threshold) {
             kmer_types.push_back({ Recombinator::absent, -1.0 * parameters.absent_score });
             selected_kmers++;
@@ -1434,10 +1621,15 @@ std::vector<char> Recombinator::classify_kmers(
 
 // Select the best pair of haplotypes from the candidates. Each haplotype gets
 // +1 for getting a kmer right and -1 for getting it wrong.
+// Also returns the remaining non-duplicated haplotypes as extra fragments if
+// the appropriate paremeters have been set.
 std::vector<std::pair<size_t, double>> select_diploid(
+    const gbwtgraph::GBZ& gbz,
     const Haplotypes::Subchain& subchain,
     const std::vector<std::pair<size_t, double>>& candidates,
-    const std::vector<std::pair<Recombinator::kmer_presence, double>>& kmer_types
+    const std::vector<std::pair<Recombinator::kmer_presence, double>>& kmer_types,
+    size_t original_selected,
+    const Recombinator::Parameters& parameters
 ) {
     std::int64_t best_score = std::numeric_limits<std::int64_t>::min();
     size_t best_left = 0, best_right = 1;
@@ -1471,6 +1663,24 @@ std::vector<std::pair<size_t, double>> select_diploid(
         }
     }
 
+    // If this is a bad subchain, we move the selected haplotypes to the front
+    // and return the remaining non-duplicated haplotypes as extra fragments.
+    // Otherwise we return only the selected haplotypes.
+    if (parameters.extra_fragments) {
+        double badness = subchain.badness(gbz);
+        if (badness > parameters.badness_threshold) {
+            std::vector<std::pair<size_t, double>> result;
+            result.push_back(candidates[best_left]);
+            result.push_back(candidates[best_right]);
+            for (size_t i = 0; i < original_selected; i++) {
+                if (i != best_left && i != best_right) {
+                    result.push_back(candidates[i]);
+                }
+            }
+            return result;
+        }
+    }
+
     return { candidates[best_left], candidates[best_right] };
 }
 
@@ -1479,6 +1689,7 @@ std::vector<std::pair<size_t, double>> select_diploid(
 // Updates the local haplotypes with scores and ranks in each round of selection
 // if provided.
 std::vector<std::pair<size_t, double>> select_haplotypes(
+    const gbwtgraph::GBZ& gbz,
     const Haplotypes::Subchain& subchain,
     const hash_map<Haplotypes::Subchain::kmer_type, size_t>& kmer_counts,
     double coverage,
@@ -1558,12 +1769,14 @@ std::vector<std::pair<size_t, double>> select_haplotypes(
         selected_haplotypes.push_back(next);
     }
 
-    // Do diploid sampling if necessary.
+    // Do diploid sampling. If this is a bad subchain, we also return the
+    // extra fragments starting from the third haplotype. But we don't return
+    // duplicated ones.
     if (parameters.diploid_sampling) {
-        return select_diploid(subchain, selected_haplotypes, kmer_types);
+        return select_diploid(gbz, subchain, selected_haplotypes, kmer_types, original_selected, parameters);
+    } else {
+        return selected_haplotypes;
     }
-
-    return selected_haplotypes;
 }
 
 Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::TopLevelChain& chain,
@@ -1592,16 +1805,28 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top
         statistics.full_haplotypes = 1;
     } else {
         bool have_haplotypes = false;
-        for (auto& subchain : chain.subchains) {
+        std::vector<RecombinatorFragment> extra_fragments;
+        for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) {
+            const auto& subchain = chain.subchains[subchain_id];
             if (subchain.type == Haplotypes::Subchain::full_haplotype) {
                 throw std::runtime_error("Recombinator::generate_haplotypes(): nontrivial chain " + std::to_string(chain.offset) + " contains a subchain with full haplotypes");
             }
             assert(!subchain.sequences.empty());
 
-            // Select the haplotypes greedily.
+            // Select the haplotypes greedily. If we are doing diploid sampling and we get
+            // extra fragments, store them for later processing.
             std::vector<std::pair<size_t, double>> selected_haplotypes = select_haplotypes(
-                subchain, kmer_counts, coverage, &statistics, nullptr, parameters
+                this->gbz, subchain, kmer_counts, coverage, &statistics, nullptr, parameters
             );
+            if (parameters.diploid_sampling && selected_haplotypes.size() > 2) {
+                for (size_t i = 2; i < selected_haplotypes.size(); i++) {
+                    gbwt::edge_type start(subchain.start, subchain.sequences[selected_haplotypes[i].first].second);
+                    extra_fragments.emplace_back(start, subchain.end, i - 1, subchain_id);
+                }
+                statistics.bad_subchains++;
+                statistics.extra_fragments += selected_haplotypes.size() - 2;
+                selected_haplotypes.resize(2);
+            }
 
             // Try to match the existing haplotypes with the selected sequences based on
             // GBWT sequence id.
@@ -1650,6 +1875,11 @@ Recombinator::Statistics Recombinator::generate_haplotypes(const Haplotypes::Top
             }
         }
         statistics.fragments = haplotypes.front().fragment;
+
+        // Add the extra fragments as separate paths.
+        for (const RecombinatorFragment& fragment : extra_fragments) {
+            fragment.generate(this->gbz.index, builder, metadata, chain.contig_name);
+        }
     }
 
     return statistics;
@@ -1696,7 +1926,7 @@ std::vector<Recombinator::LocalHaplotype> Recombinator::extract_sequences(
     double coverage = get_or_estimate_coverage(counts, parameters, this->verbosity);
 
     // Fill in the scores.
-    select_haplotypes(subchain, counts, coverage, nullptr, &result, parameters);
+    select_haplotypes(this->gbz, subchain, counts, coverage, nullptr, &result, parameters);
 
     return result;
 }
diff --git a/src/recombinator.hpp b/src/recombinator.hpp
index 17e6f618b2..b8969917fd 100644
--- a/src/recombinator.hpp
+++ b/src/recombinator.hpp
@@ -37,6 +37,9 @@ namespace vg {
  *
  * Versions:
  *
+ * * Version 3: Subchains use smaller integers when possible. Compatible with
+ *   version 2.
+ *
  * * Version 2: Top-level chains include a contig name. Compatible with version 1.
  *
  * * Version 1: Initial version.
@@ -61,7 +64,7 @@ class Haplotypes {
     /// Header of the serialized file.
     struct Header {
         constexpr static std::uint32_t MAGIC_NUMBER = 0x4C504148; // "HAPL"
-        constexpr static std::uint32_t VERSION = 2;
+        constexpr static std::uint32_t VERSION = 3;
         constexpr static std::uint32_t MIN_VERSION = 1;
         constexpr static std::uint64_t DEFAULT_K = 29;
 
@@ -90,6 +93,9 @@ class Haplotypes {
     /// A GBWT sequence as (sequence identifier, offset in a node).
     typedef std::pair<gbwt::size_type, gbwt::size_type> sequence_type;
 
+    /// A more space-efficient representation of `sequence_type`.
+    typedef std::pair<std::uint32_t, std::uint32_t> compact_sequence_type;
+
     /// Representation of a subchain.
     struct Subchain {
         /// Subchain types.
@@ -118,13 +124,21 @@ class Haplotypes {
 
         /// A vector of distinct kmers. For each kmer, list the kmer itself and the number
         /// of haplotypes it appears in.
-        std::vector<std::pair<kmer_type, size_t>> kmers;
+        std::vector<kmer_type> kmers;
 
-        // TODO: This could be smaller
-        /// Sequences as (GBWT sequence id, offset in the relevant node).
-        std::vector<sequence_type> sequences;
+        /// Number of haplotypes each kmer appears in.
+        sdsl::int_vector<0> kmer_counts;
 
-        // TODO: This needs to be compressed for larger datasets.
+        /// Sequences as (GBWT sequence id, offset in the relevant node).
+        std::vector<compact_sequence_type> sequences;
+
+        // TODO v3: Use an extra bit for each sequence to mark whether the presence for that sequence
+        // is stored explicitly or relative to the last explicit sequence.
+        // We need to cluster the sequences by similarity and store the clusters consecutively.
+        // And then use sd_vector for the sequences with relative presence.
+        // Decompress to a single bitvector when needed.
+        /// A bit vector marking the presence of kmers in the sequences.
+        /// Sequence `i` contains kmer `j` if and only if `kmers_present[i * kmers.size() + j] == 1`.
         sdsl::bit_vector kmers_present;
 
         /// Returns the start node as a GBWTGraph handle.
@@ -142,9 +156,30 @@ class Haplotypes {
         /// Returns a string representation of the type and the boundary nodes.
         std::string to_string() const;
 
+        /// Returns (sequence identifier, offset in a node) for the given sequence.
+        sequence_type get_sequence(size_t i) const {
+            return { this->sequences[i].first, this->sequences[i].second };
+        }
+
+        /// Returns the distance from the last base of `start` to the first base of
+        /// `end` over the given sequence. Returns 0 if the subchain is not normal or
+        /// if the sequence does not exist.
+        size_t distance(const gbwtgraph::GBZ& gbz, size_t i) const;
+
+        /// Returns an estimate of the badness of the subchain.
+        /// The ideal value is 0.0, and higher values indicate worse subchains.
+        /// The estimate is based on the following factors:
+        /// * Length of the subchain.
+        /// * Number of haplotypes relative to the expected number.
+        /// * Information content of the kmers (disabled).
+        double badness(const gbwtgraph::GBZ& gbz) const;
+
         /// Serializes the object to a stream in the simple-sds format.
         void simple_sds_serialize(std::ostream& out) const;
 
+        /// Loads a less space-efficient version 1 or 2 subchain.
+        void load_v1(std::istream& in);
+
         /// Loads the object from a stream in the simple-sds format.
         void simple_sds_load(std::istream& in);
 
@@ -172,8 +207,11 @@ class Haplotypes {
         /// Loads the object from a stream in the simple-sds format.
         void simple_sds_load(std::istream& in);
 
-        /// Loads the old version without a contig name.
-        void load_old(std::istream& in);
+        /// Loads a version 1 chain without a contig name.
+        void load_v1(std::istream& in);
+
+        /// Loads a less space-efficient version 2 chain.
+        void load_v2(std::istream& in);
 
         /// Returns the size of the object in elements.
         size_t simple_sds_size() const;
@@ -392,6 +430,10 @@ class Recombinator {
     /// A reasonable number of candidates for diploid sampling.
     constexpr static size_t NUM_CANDIDATES = 32;
 
+    // TODO: Proper threshold?
+    /// Badness threshold for subchains.
+    constexpr static double BADNESS_THRESHOLD = 4.0;
+
     /// Expected kmer coverage. Use 0 to estimate from kmer counts.
     constexpr static size_t COVERAGE = 0;
 
@@ -425,6 +467,9 @@ class Recombinator {
         /// Number of subchains.
         size_t subchains = 0;
 
+        /// Number of subchains exceeding the badness threshold.
+        size_t bad_subchains = 0;
+
         /// Number of fragments.
         size_t fragments = 0;
 
@@ -434,6 +479,9 @@ class Recombinator {
         /// Number of haplotypes generated.
         size_t haplotypes = 0;
 
+        /// Number of additional haplotype fragments in bad subchains.
+        size_t extra_fragments = 0;
+
         /// Number of times a haplotype was extended from a subchain to the next subchain.
         size_t connections = 0;
 
@@ -491,9 +539,17 @@ class Recombinator {
         /// highest-scoring pair out of them.
         bool diploid_sampling = false;
 
+        /// When using diploid sampling, include the remaining candidates as
+        /// additional fragments in bad subchains.
+        bool extra_fragments = false;
+
+        /// Badness threshold for subchains when using diploid sampling.
+        double badness_threshold = BADNESS_THRESHOLD;
+
         /// Include named and reference paths.
         bool include_reference = false;
 
+        // TODO: Should be use extra_fragments?
         /// Preset parameters for common use cases.
         enum preset_t {
             /// Default parameters.
diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp
index 883558e567..05a204fdf8 100644
--- a/src/subcommand/giraffe_main.cpp
+++ b/src/subcommand/giraffe_main.cpp
@@ -2042,6 +2042,9 @@ int main_giraffe(int argc, char** argv) {
                             std::cerr << "Thread " << thread_num << " now mapping " << aln1.name() << ", " << aln2.name() << std::endl;
                         }
                         
+
+                        check_quality_length(aln1);
+                        check_quality_length(aln2);
                         toUppercaseInPlace(*aln1.mutable_sequence());
                         toUppercaseInPlace(*aln2.mutable_sequence());
 
@@ -2150,6 +2153,7 @@ int main_giraffe(int argc, char** argv) {
                             std::cerr << "Thread " << thread_num << " now mapping " << aln.name() << std::endl;
                         }
                         
+                        check_quality_length(aln);
                         toUppercaseInPlace(*aln.mutable_sequence());
                     
                         // Map the read with the MinimizerMapper.
diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp
index 3a3c36a3ba..ec7a9f97bf 100644
--- a/src/subcommand/haplotypes_main.cpp
+++ b/src/subcommand/haplotypes_main.cpp
@@ -1,6 +1,14 @@
 /** \file haplotypes_main.cpp
  *
  * Defines the "vg haplotypes" subcommand, which samples haplotypes by kmer counts in the reads.
+ *
+ * TODO:
+ *
+ * 1. Option to sample non-contiguous haplotypes. This may be important in large
+ *    snarls. Select a suffix of a fragment, all fragments fully within the snarl,
+ *    and the prefix of the fragment exiting the snarl.
+ *
+ * 2. Tests for --linear-structure, --extra-fragments, and #1.
  */
 
 #include "subcommand.hpp"
@@ -25,50 +33,62 @@ using namespace vg;
 
 //----------------------------------------------------------------------------
 
+// Default values for command line parameters.
+
+namespace haplotypes_defaults {
+
 constexpr size_t DEFAULT_MAX_THREADS = 16;
 
-size_t haplotypes_default_threads() {
+size_t threads() {
     size_t threads = omp_get_max_threads();
     threads = std::max(threads, size_t(1));
     return std::min(threads, DEFAULT_MAX_THREADS);
 }
 
-constexpr size_t haplotypes_default_k() {
+constexpr size_t k() {
     return Haplotypes::Header::DEFAULT_K;
 }
 
-constexpr size_t haplotypes_default_w() {
+constexpr size_t w() {
     return gbwtgraph::Key64::WINDOW_LENGTH;
 }
 
-constexpr size_t haplotypes_default_subchain_length() {
+constexpr size_t subchain_length() {
     return HaplotypePartitioner::SUBCHAIN_LENGTH;
 }
 
-constexpr size_t haplotypes_default_n() {
+constexpr size_t n() {
     return Recombinator::NUM_HAPLOTYPES;
 }
 
-constexpr size_t haplotypes_default_candidates() {
+constexpr size_t candidates() {
     return Recombinator::NUM_CANDIDATES;
 }
 
-constexpr size_t haplotypes_default_coverage() {
+constexpr size_t coverage() {
     return Recombinator::COVERAGE;
 }
 
-constexpr double haplotypes_default_discount() {
+constexpr double discount() {
     return Recombinator::PRESENT_DISCOUNT;
 }
 
-constexpr double haplotypes_default_adjustment() {
+constexpr double adjustment() {
     return Recombinator::HET_ADJUSTMENT;
 }
 
-constexpr double haplotypes_default_absent() {
+constexpr double absent() {
     return Recombinator::ABSENT_SCORE;
 }
 
+constexpr double badness() {
+    return Recombinator::BADNESS_THRESHOLD;
+}
+
+}; // namespace haplotypes_defaults
+
+//----------------------------------------------------------------------------
+
 struct HaplotypesConfig {
     enum OperatingMode {
         mode_invalid,
@@ -78,6 +98,7 @@ struct HaplotypesConfig {
         mode_map_variants,
         mode_extract,
         mode_classify,
+        mode_statistics,
     };
 
     OperatingMode mode = mode_invalid;
@@ -90,7 +111,7 @@ struct HaplotypesConfig {
     std::string haplotype_input, kmer_input, vcf_input;
 
     // Computational parameters.
-    size_t k = haplotypes_default_k(), w = haplotypes_default_w();
+    size_t k = haplotypes_defaults::k(), w = haplotypes_defaults::w();
     HaplotypePartitioner::Parameters partitioner_parameters;
     Recombinator::Parameters recombinator_parameters;
 
@@ -101,8 +122,11 @@ struct HaplotypesConfig {
     size_t chain_id = std::numeric_limits<size_t>::max();
     size_t subchain_id = std::numeric_limits<size_t>::max();
 
+    // For subchain statistics.
+    std::string ref_sample;
+
     // Other parameters.
-    size_t threads = haplotypes_default_threads();
+    size_t threads = haplotypes_defaults::threads();
     bool validate = false;
 
     HaplotypesConfig(int argc, char** argv, size_t max_threads);
@@ -118,13 +142,15 @@ void extract_haplotypes(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes,
 
 void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config);
 
+void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config);
+
 //----------------------------------------------------------------------------
 
 int main_haplotypes(int argc, char** argv) {
     double start = gbwt::readTimer();
     gbwt::Verbosity::set(gbwt::Verbosity::SILENT);
     size_t max_threads = omp_get_max_threads();
-    omp_set_num_threads(haplotypes_default_threads());
+    omp_set_num_threads(haplotypes_defaults::threads());
 
     // Parse the arguments.
     HaplotypesConfig config(argc, argv, max_threads);
@@ -177,6 +203,11 @@ int main_haplotypes(int argc, char** argv) {
         classify_kmers(gbz, haplotypes, config);
     }
 
+    // Output statistics on subchain lengths and kmer counts.
+    if (config.mode == HaplotypesConfig::mode_statistics) {
+        subchain_statistics(gbz, haplotypes, config);
+    }
+
     if (config.verbosity >= Haplotypes::verbosity_basic) {
         double seconds = gbwt::readTimer() - start;
         double gib = gbwt::inGigabytes(gbwt::memoryUsage());
@@ -198,6 +229,8 @@ void help_haplotypes(char** argv, bool developer_options) {
     if (developer_options) {
         std::cerr << usage << "-i graph.hapl --vcf-input variants.vcf graph.gbz > output.tsv" << std::endl;
         std::cerr << usage << "-i graph.hapl -k kmers.kff --extract M:N graph.gbz > output.fa" << std::endl;
+        std::cerr << usage << "-i graph.hapl -k kmers.kff --classify output.kmers graph.gbz" << std::endl;
+        std::cerr << usage << "-i graph.hapl --statistics ref_sample graph.gbz > output.tsv" << std::endl;
     }
     std::cerr << std::endl;
 
@@ -214,26 +247,28 @@ void help_haplotypes(char** argv, bool developer_options) {
     std::cerr << "    -k, --kmer-input X        use kmer counts from this KFF file (required for --gbz-output)" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Options for generating haplotype information:" << std::endl;
-    std::cerr << "        --kmer-length N       kmer length for building the minimizer index (default: " << haplotypes_default_k() << ")" << std::endl;
-    std::cerr << "        --window-length N     window length for building the minimizer index (default: " << haplotypes_default_w() << ")" << std::endl;
-    std::cerr << "        --subchain-length N   target length (in bp) for subchains (default: " << haplotypes_default_subchain_length() << ")" << std::endl;
+    std::cerr << "        --kmer-length N       kmer length for building the minimizer index (default: " << haplotypes_defaults::k() << ")" << std::endl;
+    std::cerr << "        --window-length N     window length for building the minimizer index (default: " << haplotypes_defaults::w() << ")" << std::endl;
+    std::cerr << "        --subchain-length N   target length (in bp) for subchains (default: " << haplotypes_defaults::subchain_length() << ")" << std::endl;
     std::cerr << "        --linear-structure    extend subchains to avoid haplotypes visiting them multiple times" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Options for sampling haplotypes:" << std::endl;
     std::cerr << "        --preset X            use preset X (default, haploid, diploid)" << std::endl;
     std::cerr << "        --coverage N          kmer coverage in the KFF file (default: estimate)" << std::endl;
-    std::cerr << "        --num-haplotypes N    generate N haplotypes (default: " << haplotypes_default_n() << ")" << std::endl;
-    std::cerr << "                              sample from N candidates (with --diploid-sampling; default: " << haplotypes_default_candidates() << ")" << std::endl;
-    std::cerr << "        --present-discount F  discount scores for present kmers by factor F (default: " << haplotypes_default_discount() << ")" << std::endl;
-    std::cerr << "        --het-adjustment F    adjust scores for heterozygous kmers by F (default: " << haplotypes_default_adjustment() << ")" << std::endl;
-    std::cerr << "        --absent-score F      score absent kmers -F/+F (default: " << haplotypes_default_absent()  << ")" << std::endl;
+    std::cerr << "        --num-haplotypes N    generate N haplotypes (default: " << haplotypes_defaults::n() << ")" << std::endl;
+    std::cerr << "                              sample from N candidates (with --diploid-sampling; default: " << haplotypes_defaults::candidates() << ")" << std::endl;
+    std::cerr << "        --present-discount F  discount scores for present kmers by factor F (default: " << haplotypes_defaults::discount() << ")" << std::endl;
+    std::cerr << "        --het-adjustment F    adjust scores for heterozygous kmers by F (default: " << haplotypes_defaults::adjustment() << ")" << std::endl;
+    std::cerr << "        --absent-score F      score absent kmers -F/+F (default: " << haplotypes_defaults::absent()  << ")" << std::endl;
     std::cerr << "        --haploid-scoring     use a scoring model without heterozygous kmers" << std::endl;
     std::cerr << "        --diploid-sampling    choose the best pair from the sampled haplotypes" << std::endl;
+    std::cerr << "        --extra-fragments     in diploid sampling, select all candidates in bad subchains" << std::endl;
+    std::cerr << "        --badness F           threshold for the badness of a subchain (default: " << haplotypes_defaults::badness() << ")" << std::endl;
     std::cerr << "        --include-reference   include named and reference paths in the output" << std::endl;
     std::cerr << std::endl;
     std::cerr << "Other options:" << std::endl;
     std::cerr << "    -v, --verbosity N         verbosity level (0 = silent, 1 = basic, 2 = detailed, 3 = debug; default: 0)" << std::endl;
-    std::cerr << "    -t, --threads N           approximate number of threads (default: " << haplotypes_default_threads() << " on this system)" << std::endl;
+    std::cerr << "    -t, --threads N           approximate number of threads (default: " << haplotypes_defaults::threads() << " on this system)" << std::endl;
     std::cerr << std::endl;
     if (developer_options) {
         std::cerr << "Developer options:" << std::endl;
@@ -241,8 +276,9 @@ void help_haplotypes(char** argv, bool developer_options) {
         std::cerr << "        --vcf-input X         map the variants in VCF file X to subchains" << std::endl;
         std::cerr << "        --contig-prefix X     a prefix for transforming VCF contig names into GBWT contig names" << std::endl;
         std::cerr << "        --extract M:N         extract haplotypes in chain M, subchain N in FASTA format" << std::endl;
-        std::cerr << "        --score-output X      write haplotype scores to X" << std::endl;
+        std::cerr << "        --score-output X      write haplotype scores to X (use with --extract)" << std::endl;
         std::cerr << "        --classify X          classify kmers and write output to X" << std::endl;
+        std::cerr << "        --statistics X        output subchain statistics over reference sample X" << std::endl;
         std::cerr << std::endl;
     }
 }
@@ -262,13 +298,16 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
     constexpr int OPT_ABSENT_SCORE = 1305;
     constexpr int OPT_HAPLOID_SCORING = 1306;
     constexpr int OPT_DIPLOID_SAMPLING = 1307;
-    constexpr int OPT_INCLUDE_REFERENCE = 1308;
+    constexpr int OPT_EXTRA_FRAGMENTS = 1308;
+    constexpr int OPT_BADNESS = 1309;
+    constexpr int OPT_INCLUDE_REFERENCE = 1310;
     constexpr int OPT_VALIDATE = 1400;
     constexpr int OPT_VCF_INPUT = 1500;
     constexpr int OPT_CONTIG_PREFIX = 1501;
     constexpr int OPT_EXTRACT = 1600;
     constexpr int OPT_SCORE_OUTPUT = 1601;
     constexpr int OPT_CLASSIFY = 1602;
+    constexpr int OPT_STATISTICS = 1603;
 
     static struct option long_options[] =
     {
@@ -290,6 +329,8 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
         { "absent-score", required_argument, 0, OPT_ABSENT_SCORE },
         { "haploid-scoring", no_argument, 0, OPT_HAPLOID_SCORING },
         { "diploid-sampling", no_argument, 0, OPT_DIPLOID_SAMPLING },
+        { "extra-fragments", no_argument, 0, OPT_EXTRA_FRAGMENTS },
+        { "badness", required_argument, 0, OPT_BADNESS },
         { "include-reference", no_argument, 0, OPT_INCLUDE_REFERENCE },
         { "verbosity", required_argument, 0, 'v' },
         { "threads", required_argument, 0, 't' },
@@ -299,6 +340,7 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
         { "extract", required_argument, 0, OPT_EXTRACT },
         { "score-output", required_argument, 0, OPT_SCORE_OUTPUT },
         { "classify", required_argument, 0, OPT_CLASSIFY },
+        { "statistics", required_argument, 0, OPT_STATISTICS },
         { 0, 0, 0, 0 }
     };
 
@@ -413,6 +455,16 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
         case OPT_DIPLOID_SAMPLING:
             this->recombinator_parameters.diploid_sampling = true;
             break;
+        case OPT_EXTRA_FRAGMENTS:
+            this->recombinator_parameters.extra_fragments = true;
+            break;
+        case OPT_BADNESS:
+            this->recombinator_parameters.badness_threshold = parse<double>(optarg);
+            if (this->recombinator_parameters.badness_threshold <= 0.0) {
+                std::cerr << "error: [vg haplotypes] badness threshold must be positive" << std::endl;
+                std::exit(EXIT_FAILURE);
+            }
+            break;
         case OPT_INCLUDE_REFERENCE:
             this->recombinator_parameters.include_reference = true;
             break;
@@ -463,6 +515,9 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
         case OPT_CLASSIFY:
             this->kmer_output = optarg;
             break;
+        case OPT_STATISTICS:
+            this->ref_sample = optarg;
+            break;
 
         case 'h':
         case '?':
@@ -492,6 +547,8 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
         this->mode = mode_extract;
     } else if (!this->haplotype_input.empty() && !this->kmer_input.empty() && !this->kmer_output.empty()) {
         this->mode = mode_classify;
+    } else if (!this->haplotype_input.empty() && !this->ref_sample.empty()) {
+        this->mode = mode_statistics;
     }
     if (this->mode == mode_invalid) {
         help_haplotypes(argv, false);
@@ -500,7 +557,7 @@ HaplotypesConfig::HaplotypesConfig(int argc, char** argv, size_t max_threads) {
 
     // Use conditional defaults if the user did not override them.
     if (this->recombinator_parameters.diploid_sampling && !num_haplotypes_set) {
-        this->recombinator_parameters.num_haplotypes = haplotypes_default_candidates();
+        this->recombinator_parameters.num_haplotypes = haplotypes_defaults::candidates();
     }
 }
 
@@ -655,26 +712,33 @@ gbwt::size_type path_for_contig(const gbwtgraph::GBZ& gbz, gbwt::size_type conti
     return path_id;
 }
 
-std::pair<gbwt::size_type, size_t> seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) {
+// Returns the GBWT sequence id for the given path in the orientation it appears in this top-level chain.
+// Returns `gbwt::invalid_sequence()` if the path is not found.
+gbwt::size_type seq_for_chain(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, size_t chain_id) {
     gbwt::size_type sequence_id = gbwt::Path::encode(path_id, false);
     gbwt::size_type reverse_id = gbwt::Path::encode(path_id, true);
+    for (const Haplotypes::Subchain& subchain : haplotypes.chains[chain_id].subchains) {
+        for (const std::pair<gbwt::size_type, size_t>& sequence : subchain.sequences) {
+            if (sequence.first == sequence_id) {
+                return sequence.first;
+            }
+            if (sequence.first == reverse_id) {
+                return sequence.first;
+            }
+        }
+    }
+    return gbwt::invalid_sequence();
+}
+
+std::pair<gbwt::size_type, size_t> seq_chain_for_path(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type path_id, const std::string& contig_name) {
     size_t found_chains = 0;
     std::pair<gbwt::size_type, size_t> result(gbwt::invalid_sequence(), haplotypes.components());
     for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) {
-        const Haplotypes::Subchain& subchain = haplotypes.chains[chain_id].subchains.front();
-        for (size_t i = 0; i < subchain.sequences.size(); i++) {
-            if (subchain.sequences[i].first == sequence_id) {
-                result.first = sequence_id;
-                result.second = chain_id;
-                found_chains++;
-                break;
-            }
-            if (subchain.sequences[i].first == reverse_id) {
-                result.first = reverse_id;
-                result.second = chain_id;
-                found_chains++;
-                break;
-            }
+        gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id);
+        if (seq_id != gbwt::invalid_sequence()) {
+            result.first = seq_id;
+            result.second = chain_id;
+            found_chains++;
         }
     }
     if (found_chains != 1) {
@@ -709,29 +773,32 @@ struct ReferenceInterval {
         return this->end - this->start;
     }
 
-    std::string to_string() const {
-        std::string result;
+    char type_as_char() const {
         switch (this->type) {
             case Haplotypes::Subchain::normal:
-                result.push_back('N');
-                break;
+                return 'N';
             case Haplotypes::Subchain::prefix:
-                result.push_back('P');
-                break;
+                return 'P';
             case Haplotypes::Subchain::suffix:
-                result.push_back('S');
-                break;
+                return 'S';
             case Haplotypes::Subchain::full_haplotype:
-                result.push_back('F');
-                break;
+                return 'F';
         }
+        return '?';
+    }
+
+    std::string to_string() const {
+        std::string result;
+        result.push_back(this->type_as_char());
         result += std::to_string(this->id) + "(" + std::to_string(this->start) + ".." + std::to_string(this->end) + ")";
         return result;
     }
 };
 
-std::vector<ReferenceInterval> subchain_intervals(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type sequence_id, size_t chain_id, bool reverse) {
-    gbwt::size_type actual_sequence_id = (reverse ? gbwt::Path::reverse(sequence_id) : sequence_id);
+// Also returns the total length of the path / GBWT sequence in bp.
+std::pair<std::vector<ReferenceInterval>, size_t> subchain_intervals(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, gbwt::size_type sequence_id, size_t chain_id) {
+    bool reverse = gbwt::Path::is_reverse(sequence_id);
+    gbwt::size_type actual_sequence_id = gbwt::Path::encode(gbwt::Path::id(sequence_id), false);
     gbwt::vector_type path = gbz.index.extract(actual_sequence_id);
     size_t total_length = 0;
     for (auto gbwt_node : path) {
@@ -778,7 +845,7 @@ std::vector<ReferenceInterval> subchain_intervals(const gbwtgraph::GBZ& gbz, con
             interval.start = seq_offset;
         } else if (subchain.type == Haplotypes::Subchain::prefix) {
             // If a prefix follows a suffix, they cover the same interval.
-            interval.start = result.back().start;
+            interval.start = (result.empty() ? 0 : result.back().start);
         }
         if (subchain.has_end()) {
             while (node_offset < path.size() && path[node_offset] != subchain.end) {
@@ -787,14 +854,14 @@ std::vector<ReferenceInterval> subchain_intervals(const gbwtgraph::GBZ& gbz, con
             }
             interval.end = seq_offset;
             // If a prefix follows a suffix, they cover the same interval.
-            if (subchain.type == Haplotypes::Subchain::prefix) {
+            if (subchain.type == Haplotypes::Subchain::prefix && !result.empty()) {
                 result.back().end = interval.end;
             }
         }
         result.push_back(interval);
     }
 
-    return result;
+    return { result, total_length };
 }
 
 void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) {
@@ -868,7 +935,10 @@ void map_variants(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const
         size_t offset = iter->second;
         gbwt::size_type sequence_id = offset_to_seq_chain[offset].first;
         gbwt::size_type chain_id = offset_to_seq_chain[offset].second;
-        auto ref_intervals = subchain_intervals(gbz, haplotypes, sequence_id, chain_id, gbwt::Path::is_reverse(sequence_id));
+        std::vector<ReferenceInterval> ref_intervals;
+        size_t total_length;
+        std::tie(ref_intervals, total_length) =
+            subchain_intervals(gbz, haplotypes, sequence_id, chain_id);
         for (auto interval : variant_positions[offset]) {
             size_t low = 0, high = ref_intervals.size();
             bool found = false;
@@ -990,6 +1060,72 @@ void classify_kmers(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, con
 
 //----------------------------------------------------------------------------
 
+gbwt::size_type path_for_sample_contig(
+    const gbwtgraph::GBZ& gbz,
+    const std::string& sample_name, const std::string& contig_name
+) {
+    gbwt::size_type sample_id = gbz.index.metadata.sample(sample_name);
+    if (sample_id >= gbz.index.metadata.samples()) {
+        std::cerr << "error: [vg haplotypes] sample " << sample_name << " not found" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    gbwt::size_type contig_id = gbz.index.metadata.contig(contig_name);
+    if (contig_id >= gbz.index.metadata.contigs()) {
+        std::cerr << "error: [vg haplotypes] contig " << contig_name << " not found" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+
+    auto paths = gbz.index.metadata.findPaths(sample_id, contig_id);
+    if (paths.size() != 1) {
+        std::cerr << "error: [vg haplotypes] found " << paths.size() << " paths for sample " << sample_name << ", contig " << contig_name << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    return paths.front();
+}
+
+void subchain_statistics(const gbwtgraph::GBZ& gbz, const Haplotypes& haplotypes, const HaplotypesConfig& config) {
+    gbwt::size_type sample_id = gbz.index.metadata.sample(config.ref_sample);
+    if (sample_id >= gbz.index.metadata.samples()) {
+        std::cerr << "error: [vg haplotypes] sample " << config.ref_sample << " not found in the graph" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Header line: graph name, sample name.
+    std::cout << "H\t" << config.graph_name << "\t" << config.ref_sample << std::endl;
+
+    for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) {
+        gbwt::size_type path_id = path_for_sample_contig(gbz, config.ref_sample, haplotypes.chains[chain_id].contig_name);
+        gbwt::size_type seq_id = seq_for_chain(gbz, haplotypes, path_id, chain_id);
+        if (seq_id == gbwt::invalid_sequence()) {
+            std::cerr << "error: [vg haplotypes] could not determine reference orientation in chain " << chain_id << std::endl;
+            std::exit(EXIT_FAILURE);
+        }
+        std::vector<ReferenceInterval> ref_intervals;
+        size_t total_length;
+        std::tie(ref_intervals, total_length) = subchain_intervals(gbz, haplotypes, seq_id, chain_id);
+
+        // Contig line: chain id, contig name, total length.
+        std::cout
+            << "C\t"
+            << chain_id << "\t"
+            << haplotypes.chains[chain_id].contig_name << "\t"
+            << total_length << std::endl;
+        // For each subchain: type, id, start, end, length, number of kmers.
+        for (size_t i = 0; i < ref_intervals.size(); i++) {
+            size_t kmers = haplotypes.chains[chain_id].subchains[ref_intervals[i].id].kmers.size();
+            std::cout
+                << ref_intervals[i].type_as_char() << "\t"
+                << ref_intervals[i].id << "\t"
+                << ref_intervals[i].start << "\t"
+                << ref_intervals[i].end << "\t"
+                << ref_intervals[i].length() << "\t"
+                << kmers << std::endl;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------
+
 void validate_error(const std::string& header, const std::string& message) {
     std::cerr << "error: [vg haplotypes] ";
     if (!header.empty()) {
@@ -1205,7 +1341,7 @@ void validate_chain(const Haplotypes::TopLevelChain& chain,
         if (subchain.type != Haplotypes::Subchain::full_haplotype) {
             hash_set<Haplotypes::Subchain::kmer_type> all_kmers;
             for (size_t i = 0; i < subchain.kmers.size(); i++) {
-                all_kmers.insert(subchain.kmers[i].first);
+                all_kmers.insert(subchain.kmers[i]);
             }
             if (all_kmers.size() != subchain.kmers.size()) {
                 std::string message = expected_got(subchain.kmers.size(), all_kmers.size()) + " kmers";
@@ -1224,15 +1360,15 @@ void validate_chain(const Haplotypes::TopLevelChain& chain,
                 }
                 for (size_t j = 0, offset = i * subchain.kmers.size(); j < subchain.kmers.size(); j++, offset++) {
                     if (subchain.kmers_present[offset]) {
-                        auto iter = unique_minimizers.find(subchain.kmers[j].first);
+                        auto iter = unique_minimizers.find(subchain.kmers[j]);
                         if (iter == unique_minimizers.end()) {
                             std::string message = "kmer " + std::to_string(j) + " not present in the haplotype";
                             validate_error_sequence(chain_id, subchain_id, i, message);
                         }
-                        used_kmers[subchain.kmers[j].first]++;
+                        used_kmers[subchain.kmers[j]]++;
                         iter->second = true;
                     } else {
-                        if (unique_minimizers.find(subchain.kmers[j].first) != unique_minimizers.end()) {
+                        if (unique_minimizers.find(subchain.kmers[j]) != unique_minimizers.end()) {
                             std::string message = "kmer " + std::to_string(j) + " is present in the haplotype";
                             validate_error_sequence(chain_id, subchain_id, i, message);
                         }
@@ -1247,8 +1383,8 @@ void validate_chain(const Haplotypes::TopLevelChain& chain,
             size_t invalid_count = 0;
             for (size_t kmer_id = 0; kmer_id < subchain.kmers.size(); kmer_id++) {
                 size_t count = 0;
-                auto iter = used_kmers.find(subchain.kmers[kmer_id].first);
-                if (iter == used_kmers.end() || iter->second != subchain.kmers[kmer_id].second) {
+                auto iter = used_kmers.find(subchain.kmers[kmer_id]);
+                if (iter == used_kmers.end() || iter->second != subchain.kmer_counts[kmer_id]) {
                     invalid_count++;
                 }
             }
@@ -1335,7 +1471,7 @@ void validate_haplotypes(const Haplotypes& haplotypes,
         for (size_t subchain_id = 0; subchain_id < chain.subchains.size(); subchain_id++) {
             const Haplotypes::Subchain& subchain = chain.subchains[subchain_id];
             for (size_t i = 0; i < subchain.kmers.size(); i++) {
-                auto iter = kmers.find(subchain.kmers[i].first);
+                auto iter = kmers.find(subchain.kmers[i]);
                 if (iter != kmers.end()) {
                     const Haplotypes::Subchain& prev = haplotypes.chains[iter->second.first].subchains[iter->second.second];
                     if (chain_id == iter->second.first && subchain_id == iter->second.second + 1 && subchain.type == Haplotypes::Subchain::prefix && prev.type == Haplotypes::Subchain::suffix) {
@@ -1346,7 +1482,7 @@ void validate_haplotypes(const Haplotypes& haplotypes,
                         validate_error_subchain(chain_id, subchain_id, message);
                     }
                 }
-                kmers[subchain.kmers[i].first] = { chain_id, subchain_id };
+                kmers[subchain.kmers[i]] = { chain_id, subchain_id };
             }
         }
     }
diff --git a/src/subcommand/map_main.cpp b/src/subcommand/map_main.cpp
index 956a4db47c..ad1fb39f58 100644
--- a/src/subcommand/map_main.cpp
+++ b/src/subcommand/map_main.cpp
@@ -978,6 +978,8 @@ int main_map(int argc, char** argv) {
                 return;
             }
 
+            check_quality_length(alignment);
+
             int tid = omp_get_thread_num();
             vector<Alignment> alignments = mapper[tid]->align_multi(alignment,
                                                                     kmer_size,
@@ -1016,6 +1018,8 @@ int main_map(int argc, char** argv) {
             };
             
             function<void(Alignment&,Alignment&)> lambda = [&](Alignment& aln1, Alignment& aln2) {
+                check_quality_length(aln1);
+                check_quality_length(aln2);
                 auto our_mapper = mapper[omp_get_thread_num()];
                 bool queued_resolve_later = false;
                 auto alnp = our_mapper->align_paired_multi(aln1,
@@ -1067,6 +1071,7 @@ int main_map(int argc, char** argv) {
         } else if (fastq2.empty()) {
             // single
             function<void(Alignment&)> lambda = [&](Alignment& alignment) {
+                        check_quality_length(alignment);
                         int tid = omp_get_thread_num();
                         vector<Alignment> alignments = mapper[tid]->align_multi(alignment,
                                                                                 kmer_size,
@@ -1092,6 +1097,8 @@ int main_map(int argc, char** argv) {
                 }
             };
             function<void(Alignment&,Alignment&)> lambda = [&](Alignment& aln1, Alignment& aln2) {
+                check_quality_length(aln1);
+                check_quality_length(aln2);
                 auto our_mapper = mapper[omp_get_thread_num()];
                 bool queued_resolve_later = false;
                 auto alnp = our_mapper->align_paired_multi(aln1,
@@ -1165,6 +1172,8 @@ int main_map(int argc, char** argv) {
                 }
             };
             function<void(Alignment&,Alignment&)> lambda = [&](Alignment& aln1, Alignment& aln2) {
+                check_quality_length(aln1);
+                check_quality_length(aln2);
                 auto our_mapper = mapper[omp_get_thread_num()];
                 bool queued_resolve_later = false;
                 auto alnp = our_mapper->align_paired_multi(aln1,
@@ -1214,6 +1223,7 @@ int main_map(int argc, char** argv) {
         } else {
             // Processing single-end GAM input
             function<void(Alignment&)> lambda = [&](Alignment& alignment) {
+                check_quality_length(alignment);
                 int tid = omp_get_thread_num();
                 vector<Alignment> alignments = mapper[tid]->align_multi(alignment,
                                                                         kmer_size,
diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp
index 7b00984a5c..f42ff1a0ac 100644
--- a/src/subcommand/mpmap_main.cpp
+++ b/src/subcommand/mpmap_main.cpp
@@ -2115,7 +2115,8 @@ int main_mpmap(int argc, char** argv) {
         if (watchdog) {
             watchdog->check_in(thread_num, alignment.name());
         }
-        
+
+        check_quality_length(alignment);
         toUppercaseInPlace(*alignment.mutable_sequence());
         
         bool is_rna = uses_Us(alignment);
@@ -2181,6 +2182,8 @@ int main_mpmap(int argc, char** argv) {
             watchdog->check_in(thread_num, alignment_1.name());
         }
         
+        check_quality_length(alignment_1);
+        check_quality_length(alignment_2);
         toUppercaseInPlace(*alignment_1.mutable_sequence());
         toUppercaseInPlace(*alignment_2.mutable_sequence());
         
diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp
index 515c2112e1..7925a8d00a 100644
--- a/src/subcommand/rna_main.cpp
+++ b/src/subcommand/rna_main.cpp
@@ -21,6 +21,7 @@ using namespace std;
 using namespace vg;
 using namespace vg::subcommand;
 
+string GZ_SUFFIX = ".gz";
 
 void help_rna(char** argv) {
     cerr << "\nusage: " << argv[0] << " rna [options] graph.[vg|pg|hg|gbz] > splicing_graph.[vg|pg|hg]" << endl
@@ -250,6 +251,21 @@ int32_t main_rna(int32_t argc, char** argv) {
         return 1;       
     }
 
+    for (const auto& filenames : {std::cref(transcript_filenames), std::cref(intron_filenames)}) {
+        // For each collection of filenames (see https://stackoverflow.com/a/64794991/)
+
+        for (auto filename : filenames.get()) {
+
+            // taken from https://stackoverflow.com/a/20446239/
+            if (filename.size() >= GZ_SUFFIX.size() &&
+                filename.compare(filename.size() - GZ_SUFFIX.size(), GZ_SUFFIX.size(), GZ_SUFFIX) == 0) {
+
+                cerr << "[vg rna] ERROR: Annotation file " << filename << " appears to be gzipped. Decompress it before use." << endl;    
+                return 1;
+            }
+        }
+    }
+
     if (!haplotypes_filename.empty() && gbz_format) {
 
         cerr << "[vg rna] ERROR: Only one set of haplotypes can be provided (GBZ file contains both a graph and haplotypes). Use either --haplotypes or --gbz-format." << endl;
diff --git a/src/subcommand/view_main.cpp b/src/subcommand/view_main.cpp
index c254058f08..859d54a835 100644
--- a/src/subcommand/view_main.cpp
+++ b/src/subcommand/view_main.cpp
@@ -41,7 +41,7 @@ void help_view(char** argv) {
          << "    -V, --vg-in                input VG format only" << endl
 
          << "    -j, --json                 output JSON format" << endl
-         << "    -J, --json-in              input JSON format" << endl
+         << "    -J, --json-in              input JSON format (use with e.g. -a as necessary)" << endl
          << "    -c, --json-stream          streaming conversion of a VG format graph in line delimited JSON format" << endl
          << "                               (this cannot be loaded directly via -J)" << endl
 
@@ -52,11 +52,11 @@ void help_view(char** argv) {
          << "    -T, --turtle-in            input turtle format." << endl
          << "    -r, --rdf_base_uri         set base uri for the RDF output" << endl
 
-         << "    -a, --align-in             input GAM format" << endl
+         << "    -a, --align-in             input GAM format, or JSON version of GAM format" << endl
          << "    -A, --aln-graph GAM        add alignments from GAM to the graph" << endl
 
-         << "    -q, --locus-in             input stream is Locus format" << endl
-         << "    -z, --locus-out            output stream Locus format" << endl
+         << "    -q, --locus-in             input is Locus format, or JSON version of Locus format" << endl
+         << "    -z, --locus-out            output is Locus format" << endl
          << "    -Q, --loci FILE            input is Locus format for use by dot output" << endl
 
          << "    -d, --dot                  output dot format" << endl
@@ -81,12 +81,12 @@ void help_view(char** argv) {
          << "    -i, --interleaved          fastq is interleaved paired-ended" << endl
 
          << "    -L, --pileup               output VG Pileup format" << endl
-         << "    -l, --pileup-in            input VG Pileup format" << endl
+         << "    -l, --pileup-in            input VG Pileup format, or JSON version of VG Pileup format" << endl
 
          << "    -B, --distance-in          input distance index" << endl
          << "    -R, --snarl-in             input VG Snarl format" << endl
          << "    -E, --snarl-traversal-in   input VG SnarlTraversal format" << endl
-         << "    -K, --multipath-in         input VG MultipathAlignment format (GAMP)" << endl
+         << "    -K, --multipath-in         input VG MultipathAlignment format (GAMP), or JSON version of GAMP format" << endl
          << "    -k, --multipath            output VG MultipathAlignment format (GAMP)" << endl
          << "    -D, --expect-duplicates    don't warn if encountering the same node or edge multiple times" << endl
          << "    -x, --extract-tag TAG      extract and concatenate messages with the given tag" << endl
diff --git a/src/utility.cpp b/src/utility.cpp
index d4f493fba3..8510d98369 100644
--- a/src/utility.cpp
+++ b/src/utility.cpp
@@ -10,6 +10,9 @@
 #include <fstream>
 #include <iostream>
 #include <cctype>
+// We don't define _GNU_SOURCE to get the cpuset functions since we will
+// already have it for libstdc++ on the platforms where we need them
+#include <sched.h>
 
 
 // For setting the temporary directory in submodules.
@@ -118,7 +121,7 @@ void choose_good_thread_count() {
     if (count == 0) {
         // First priority: OMP_NUM_THREADS
         const char* value = getenv("OMP_NUM_THREADS");
-        if (value) {
+        if (value && *value != '\0') {
             // Read the value. Throws if it isn't a legit number.
             count = std::stoi(value);
         }
@@ -144,6 +147,34 @@ void choose_good_thread_count() {
         }
     }
 
+#if !defined(__APPLE__) && defined(_GNU_SOURCE)
+    if (count == 0) {
+        // Next priority: CPU affinity mask (used by Slurm)
+        cpu_set_t mask;
+        if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &mask)) {
+            // TODO: If you have >1024 bits in your mask, glibc can't deal and you will get EINVAL.
+            // We're supposed to then try increasingly large dynamically-allocated CPU flag sets until we find one that works.
+            auto problem = errno;
+            std::cerr << "warning[vg]: Cannot determine CPU count from affinity mask: " << strerror(problem) << std::endl;
+        } else {
+            // We're also supposed to intersect this mask with the actual
+            // existing processors, in case somebody flags on way more
+            // processors than actually exist. But Linux doesn't seem to do
+            // that by default, so we don't worry about it.
+            count = CPU_COUNT(&mask);
+        }
+    }
+#endif
+
+    if (count == 0) {
+        // Next priority: SLURM_JOB_CPUS_PER_NODE
+        const char* value = getenv("SLURM_JOB_CPUS_PER_NODE");
+        if (value && *value != '\0') {
+            // Read the value. Throws if it isn't a legit number.
+            count = std::stoi(value);
+        }
+    }
+
     if (count == 0) {
         // Next priority: hardware concurrency as reported by the STL.
         // This may itself be 0 if ungettable.
diff --git a/src/utility.hpp b/src/utility.hpp
index f798739df0..6d8d5c17f7 100644
--- a/src/utility.hpp
+++ b/src/utility.hpp
@@ -38,8 +38,9 @@ double get_fraction_of_ns(const string& seq);
 /// TODO: Assumes that this is the same for every parallel section.
 int get_thread_count(void);
 /// Decide on and apply a sensible OMP thread count. Pay attention to
-/// OMP_NUM_THREADS if set, the "hardware concurrency", and container limit
-/// information that may be available in /proc.
+/// OMP_NUM_THREADS and SLURM_JOB_CPUS_PER_NODE if set, the "hardware
+/// concurrency", and container limit information that may be available in
+/// /proc.
 void choose_good_thread_count();
 string wrap_text(const string& str, size_t width);
 bool is_number(const string& s);