From afe9f795a911bef543b10ad079c91cb64435e3fe Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 24 Oct 2024 15:29:05 -0700 Subject: [PATCH 01/13] src/simreads.cpp: refactoring loops so that the memory footprint does not grow with the size of the simulation --- src/simreads.cpp | 353 ++++++++++++++++++++++------------------------- 1 file changed, 164 insertions(+), 189 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index e5c72d6..eeca66b 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -16,94 +16,94 @@ * General Public License for more details. */ +#include "OptionParser.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" -#include "OptionParser.hpp" -#include "simreads.hpp" #include "AbismalIndex.hpp" #include "cigar_utils.hpp" #include "sam_record.hpp" +#include "simreads.hpp" -#include +#include +#include // for the int8_t and friends #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include // for the int8_t and friends -#include +#include // getpid() -using std::vector; -using std::runtime_error; -using std::string; using std::cerr; using std::endl; using std::function; -using std::istream; -using std::ostream; using std::ifstream; +using std::istream; +using std::istringstream; using std::ofstream; +using std::ostream; using std::ostringstream; -using std::istringstream; +using std::runtime_error; +using std::string; using std::to_string; +using std::vector; +template using num_lim = std::numeric_limits; namespace simreads_random { - // ADS: I made this namespace and functions because different - // implementations of rand() on different OS meant that even with - // the same seed, the results could be different. This meant testing - // didn't work. - bool initialized = false; - std::default_random_engine e; - std::uniform_real_distribution dr; - std::uniform_int_distribution di; - void initialize(const size_t the_seed) { - e = std::default_random_engine(the_seed); - initialized = true; - } - int rand() { - assert(initialized); - // ADS: should have same range as ordinary rand() by properties of - // std::uniform_int_distribution default constructor. - return di(e); - } - double rand_double() { // ADS: in the interval [0, 1] - assert(initialized); - // ADS: default constructor for std::uniform_real_distribution - // sets a range of [0,1) - return dr(e); - } +// ADS: I made this namespace and functions because different +// implementations of rand() on different OS meant that even with +// the same seed, the results could be different. This meant testing +// didn't work. +bool initialized = false; +std::default_random_engine e; +std::uniform_real_distribution dr; +std::uniform_int_distribution di; +void +initialize(const size_t the_seed) { + e = std::default_random_engine(the_seed); + initialized = true; } - +int +rand() { + assert(initialized); + // ADS: should have same range as ordinary rand() by properties of + // std::uniform_int_distribution default constructor. + return di(e); +} +double +rand_double() { // ADS: in the interval [0, 1] + assert(initialized); + // ADS: default constructor for std::uniform_real_distribution + // sets a range of [0,1) + return dr(e); +} +} // namespace simreads_random static string format_fastq_record(const string &name, const string &read) { assert(!name.empty()); ostringstream oss; - oss << '@' << name << endl << read << endl - << '+' << endl << string(read.length(), 'B'); + oss << '@' << name << endl + << read << endl + << '+' << endl + << string(read.length(), 'B'); return oss.str(); } - struct FragInfo { - void set_sequential_name() { - name = "read" + to_string(frag_count++); - } - string - read1() const { + void set_sequential_name() { name = "read" + to_string(frag_count++); } + string read1() const { assert(!name.empty()); string read = seq.substr(0, read_length); for (size_t i = 0; i < read_length - read.length(); ++i) read += random_base(); return format_fastq_record(name + ".1", read); } - string - read2() const { + string read2() const { assert(!name.empty()); string read(seq); revcomp_inplace(read); @@ -112,10 +112,9 @@ struct FragInfo { read += random_base(); return format_fastq_record(name + ".2", read); } - void - erase_info_through_insert() { + void erase_info_through_insert() { const size_t orig_ref_len = end_pos - start_pos; - if (2*read_length < seq.length()) { + if (2 * read_length < seq.length()) { string cigar2(cigar); truncate_cigar_q(cigar, read_length); reverse_cigar(begin(cigar2), end(cigar2)); @@ -123,20 +122,15 @@ struct FragInfo { reverse_cigar(begin(cigar2), end(cigar2)); const size_t rseq_ops = cigar_rseq_ops(cigar) + cigar_rseq_ops(cigar2); cigar = cigar + to_string(orig_ref_len - rseq_ops) + "N" + cigar2; - seq = seq.substr(0, read_length) + - string(orig_ref_len - rseq_ops, 'N') + - seq.substr(seq.length() - read_length, read_length); - + seq = seq.substr(0, read_length) + string(orig_ref_len - rseq_ops, 'N') + + seq.substr(seq.length() - read_length, read_length); } } - void - remove_cigar_match_symbols() { + void remove_cigar_match_symbols() { replace(begin(cigar), end(cigar), '=', 'M'); merge_equal_neighbor_cigar_ops(cigar); } - void - bisulfite_conversion(const bool random_pbat, - const double bs_conv) { + void bisulfite_conversion(const bool random_pbat, const double bs_conv) { if (pbat || (random_pbat && simreads_random::rand_double() < 0.5)) { for (auto it(begin(seq)); it != end(seq); ++it) { if (*it == 'G' && (simreads_random::rand_double() < bs_conv)) @@ -171,7 +165,6 @@ bool FragInfo::pbat = false; size_t FragInfo::frag_count = 0; size_t FragInfo::read_length = 100; - static ostream & operator<<(ostream &out, FragInfo &the_info) { const bool rc = the_info.rc(); @@ -181,12 +174,14 @@ operator<<(ostream &out, FragInfo &the_info) { samflags::set(flags_read, samflags::read_paired); samflags::set(flags_read, samflags::read_pair_mapped); samflags::set(flags_read, samflags::template_first); - samflags::set(flags_read, the_info.rc() ? samflags::read_rc : samflags::mate_rc); + samflags::set(flags_read, + the_info.rc() ? samflags::read_rc : samflags::mate_rc); samflags::set(flags_mate, samflags::read_paired); samflags::set(flags_mate, samflags::read_pair_mapped); samflags::set(flags_mate, samflags::template_last); - samflags::set(flags_mate, the_info.rc() ? samflags::mate_rc : samflags::read_rc); + samflags::set(flags_mate, + the_info.rc() ? samflags::mate_rc : samflags::read_rc); const size_t read_pos = the_info.start_pos + 1; const size_t mate_pos = the_info.end_pos - FragInfo::read_length + 1; @@ -210,64 +205,47 @@ operator<<(ostream &out, FragInfo &the_info) { const size_t pos1 = rc ? mate_pos : read_pos; const size_t pos2 = rc ? read_pos : mate_pos; - return out << the_info.name << ".1\t" - << flags_read << '\t' - << the_info.chrom << '\t' - << pos1 << '\t' - << "255\t" - << cigar1 << '\t' - << "=\t" - << pos2 << "\t" - << tlen << '\t' - << seq1 << "\t" + return out << the_info.name << ".1\t" << flags_read << '\t' << the_info.chrom + << '\t' << pos1 << '\t' << "255\t" << cigar1 << '\t' << "=\t" + << pos2 << "\t" << tlen << '\t' << seq1 << "\t" << "*" << endl - << the_info.name << ".2\t" - << flags_mate << '\t' - << the_info.chrom << '\t' - << pos2 << '\t' - << "255\t" - << cigar2 << '\t' - << "=\t" - << pos1 << "\t" - << -tlen << '\t' - << seq2 << "\t" + << the_info.name << ".2\t" << flags_mate << '\t' << the_info.chrom + << '\t' << pos2 << '\t' << "255\t" << cigar2 << '\t' << "=\t" + << pos1 << "\t" << -tlen << '\t' << seq2 << "\t" << "*"; } - // extract the position of the fragment checking all bases are valid static void -sim_frag_position(const string &genome, const size_t frag_len, - string &the_frag, size_t &the_position) { - static auto is_invalid = [](const char c) {return !valid_base(c);}; +sim_frag_position(const string &genome, const size_t frag_len, string &the_frag, + size_t &the_position) { + static auto is_invalid = [](const char c) { return !valid_base(c); }; const size_t lim = genome.length() - frag_len + 1; do { the_position = simreads_random::rand() % lim; the_frag = string(begin(genome) + the_position, begin(genome) + the_position + frag_len); - } - while (find_if(begin(the_frag), end(the_frag), is_invalid) != end(the_frag)); + } while (find_if(begin(the_frag), end(the_frag), is_invalid) != + end(the_frag)); } - // simulate from a uniform distribution in a range static size_t sim_frag_length(const size_t min_length, const size_t max_length) { assert(max_length >= min_length); - if (min_length == max_length) return min_length; + if (min_length == max_length) + return min_length; const size_t diff = max_length - min_length; return min_length + (simreads_random::rand() % diff); } - struct FragSampler { FragSampler(const string &g, const ChromLookup c, const char sc, const size_t milen, const size_t malen) : genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen) {} - void - sample_fragment(FragInfo &the_info) const { + void sample_fragment(FragInfo &the_info) const { const size_t frag_len = sim_frag_length(min_length, max_length); sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos); @@ -277,17 +255,21 @@ struct FragSampler { the_info.start_pos = offset; the_info.end_pos = the_info.start_pos + frag_len; - the_info.set_sequential_name(); // default - the_info.strand = sim_strand(); // based on frag code + the_info.set_sequential_name(); // default + the_info.strand = sim_strand(); // based on frag code if (the_info.strand == '-') revcomp_inplace(the_info.seq); - the_info.cigar = to_string(frag_len) + "M"; // default, no muts + the_info.cigar = to_string(frag_len) + "M"; // default, no muts } char sim_strand() const { - if (strand_code == 'f') return '+'; - else if (strand_code == 'r') return '-'; - else if (strand_code == 'b') return (simreads_random::rand() & 1) ? '+' : '-'; - else throw runtime_error("bad strand code: " + to_string(strand_code)); + if (strand_code == 'f') + return '+'; + else if (strand_code == 'r') + return '-'; + else if (strand_code == 'b') + return (simreads_random::rand() & 1) ? '+' : '-'; + else + throw runtime_error("bad strand code: " + to_string(strand_code)); return '\0'; } const string &genome; @@ -297,12 +279,13 @@ struct FragSampler { size_t max_length; }; - struct FragMutator { FragMutator(const double m, const double s, const double i, const double d) : - mutation_rate(m), substitution_rate(s), - insertion_rate(i), deletion_rate(d) { - const double total = substitution_rate + insertion_rate + deletion_rate; + mutation_rate(m), substitution_rate(s), insertion_rate(i), + deletion_rate(d) { + const double total = + std::max(substitution_rate + insertion_rate + deletion_rate, + num_lim::min()); substitution_rate /= total; insertion_rate /= total; deletion_rate /= total; @@ -332,24 +315,28 @@ struct FragMutator { ++the_info.score; ++i; } - else { //if (mut == '=') { + else { // if (mut == '=') { cigar += "="; seq += the_info.seq[i]; ++i; } } - the_info.cigar.resize(2*cigar.size()); + the_info.cigar.resize(2 * cigar.size()); compress_cigar(begin(cigar), end(cigar), the_info.cigar); swap(seq, the_info.seq); } char sample_mutation() const { - const double x = simreads_random::rand_double(); - if (x > mutation_rate) return '='; + const double x = simreads_random::rand_double(); + if (x > mutation_rate) + return '='; else { - const double y = simreads_random::rand_double(); - if (y < substitution_rate) return 'M'; - else if (y < insertion_rate) return 'I'; - else return 'D'; + const double y = simreads_random::rand_double(); + if (y < substitution_rate) + return 'M'; + else if (y < insertion_rate) + return 'I'; + else + return 'D'; } } string tostring() const { @@ -360,17 +347,15 @@ struct FragMutator { << "deletion_rate=" << deletion_rate; return oss.str(); } - double mutation_rate; - double substitution_rate; - double insertion_rate; - double deletion_rate; + double mutation_rate{}; + double substitution_rate{}; + double insertion_rate{}; + double deletion_rate{}; }; - static void extract_change_type_vals(const string &change_type_vals, - double &substitution_rate, - double &insertion_rate, + double &substitution_rate, double &insertion_rate, double &deletion_rate) { if (!change_type_vals.empty()) { istringstream iss(change_type_vals); @@ -383,7 +368,6 @@ extract_change_type_vals(const string &change_type_vals, } } - int simreads(int argc, const char **argv) { @@ -404,7 +388,7 @@ simreads(int argc, const char **argv) { char strand_arg = 'b'; - size_t rng_seed = std::numeric_limits::max(); + size_t rng_seed = num_lim::max(); double mutation_rate = 0.0; string change_type_vals; @@ -414,35 +398,36 @@ simreads(int argc, const char **argv) { double bs_conv = 1.0; - size_t max_mutations = std::numeric_limits::max(); + size_t max_mutations = num_lim::max(); /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "simulate reads for " - "testing walt2", "", 1); + OptionParser opt_parse(strip_path(argv[0]), + "simulate reads for " + "testing walt2", + "", 1); opt_parse.set_show_defaults(); opt_parse.add_opt("out", 'o', "output file prefix", true, output_prefix); opt_parse.add_opt("single", '\0', "output single end", false, single_end); opt_parse.add_opt("loc", '\0', "write locations", false, write_locations); - opt_parse.add_opt("read-len", 'l', "read length", false, FragInfo::read_length); - opt_parse.add_opt("min-fraglen", '\0', "min fragment length", - false, min_frag_len); - opt_parse.add_opt("max-fraglen", '\0', "max fragment length", - false, max_frag_len); + opt_parse.add_opt("read-len", 'l', "read length", false, + FragInfo::read_length); + opt_parse.add_opt("min-fraglen", '\0', "min fragment length", false, + min_frag_len); + opt_parse.add_opt("max-fraglen", '\0', "max fragment length", false, + max_frag_len); opt_parse.add_opt("n-reads", 'n', "number of reads", false, n_reads); opt_parse.add_opt("mut", 'm', "mutation rate", false, mutation_rate); - opt_parse.add_opt("bis", 'b', "bisulfite conversion rate", false,\ - bs_conv); + opt_parse.add_opt("bis", 'b', "bisulfite conversion rate", false, bs_conv); opt_parse.add_opt("show-matches", '\0', "show match symbols in cigar", false, show_cigar_matches); - opt_parse.add_opt("changes", 'c', - "change types (comma sep relative vals)", + opt_parse.add_opt("changes", 'c', "change types (comma sep relative vals)", false, change_type_vals); opt_parse.add_opt("max-mut", 'M', "max mutations", false, max_mutations); opt_parse.add_opt("pbat", 'a', "pbat", false, FragInfo::pbat); opt_parse.add_opt("random-pbat", 'R', "random pbat", false, random_pbat); opt_parse.add_opt("strand", 's', "strand {f, r, b}", false, strand_arg); - opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", - false, rng_seed); + opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", false, + rng_seed); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -465,12 +450,12 @@ simreads(int argc, const char **argv) { const string genome_file(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - extract_change_type_vals(change_type_vals, - substitution_rate, insertion_rate, deletion_rate); + extract_change_type_vals(change_type_vals, substitution_rate, + insertion_rate, deletion_rate); - if (rng_seed == std::numeric_limits::max()) { + if (rng_seed == num_lim::max()) rng_seed = time(0) + getpid(); - } + if (VERBOSE) cerr << "rng seed: " << rng_seed << endl; simreads_random::initialize(rng_seed); @@ -484,74 +469,64 @@ simreads(int argc, const char **argv) { ChromLookup cl; load_genome(genome_file, genome, cl); transform(begin(genome), end(genome), begin(genome), - [](unsigned char c){return toupper(c);}); - - - FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len); - if (VERBOSE) - cerr << "[constructed fragment sampler]" << endl; - if (VERBOSE) - cerr << "[simulating clean frags]" << endl; - vector the_info(n_reads); - for (size_t i = 0; i < n_reads; ++i) - frag_samp.sample_fragment(the_info[i]); - - if (VERBOSE) - cerr << "[mutating the frags]" << endl; - FragMutator frag_mut(mutation_rate, substitution_rate, - insertion_rate, deletion_rate); - - for (size_t i = 0; i < the_info.size(); ++i) - frag_mut.mutate(the_info[i]); - - for (size_t i = 0; i < the_info.size(); ++i) - the_info[i].bisulfite_conversion(random_pbat, bs_conv); - - if (!show_cigar_matches) - for (size_t i = 0; i < the_info.size(); ++i) - the_info[i].remove_cigar_match_symbols(); - + [](unsigned char c) { return toupper(c); }); + ofstream loc_out; if (write_locations) { - const string locations_file = output_prefix + ".sam"; if (VERBOSE) - cerr << "[writing frag locations: " << locations_file << "]" << endl; - ofstream loc_out(locations_file); + cerr << "[opening frag locations file: " << locations_file << "]" + << endl; + loc_out.open(locations_file); if (!loc_out) - throw runtime_error("bad locations file: " + locations_file); - for (size_t i = 0; i < the_info.size(); ++i) { - loc_out << the_info[i] << endl; - } + throw runtime_error("bad locations output file: " + locations_file); } const string read1_outfile = output_prefix + "_1.fq"; if (VERBOSE) - cerr << "[writing read1 fastq: " << read1_outfile << "]" << endl; + cerr << "[opening read1 fastq: " << read1_outfile << "]" << endl; ofstream read1_out(read1_outfile); if (!read1_out) throw runtime_error("bad output file: " + read1_outfile); - for (size_t i = 0; i < the_info.size(); ++i) - read1_out << the_info[i].read1() << endl; - + ofstream read2_out; if (!single_end) { const string read2_outfile = output_prefix + "_2.fq"; if (VERBOSE) - cerr << "[writing read2 fastq: " << read2_outfile << "]" << endl; - ofstream read2_out(read2_outfile); + cerr << "[opening read2 fastq: " << read2_outfile << "]" << endl; + read2_out.open(read2_outfile); if (!read2_out) throw runtime_error("bad output file: " + read2_outfile); - for (size_t i = 0; i < the_info.size(); ++i) - read2_out << the_info[i].read2() << endl; + } + + FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len); + if (VERBOSE) + cerr << "[constructed fragment sampler]" << endl; + + FragMutator frag_mut(mutation_rate, substitution_rate, insertion_rate, + deletion_rate); + if (VERBOSE) + cerr << "[constructed mutator]" << endl; + + if (VERBOSE) + cerr << "[simulating frags]" << endl; + + for (size_t i = 0; i < n_reads; ++i) { + FragInfo info; + frag_samp.sample_fragment(info); + frag_mut.mutate(info); + info.bisulfite_conversion(random_pbat, bs_conv); + if (!show_cigar_matches) + info.remove_cigar_match_symbols(); + if (write_locations) + loc_out << info << '\n'; + read1_out << info.read1() << '\n'; + if (!single_end) + read2_out << info.read2() << '\n'; } } - catch (const runtime_error &e) { + catch (const std::exception &e) { cerr << e.what() << endl; return EXIT_FAILURE; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; - return EXIT_FAILURE; - } return EXIT_SUCCESS; } From 2283443336cf80e7452551ef8a98ca4ca0af3257 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 24 Oct 2024 19:39:05 -0700 Subject: [PATCH 02/13] src/simreads.cpp: adding option to output in FASTA format which helps for speed and disk footprint when running tests on massive simulated test data sets --- src/simreads.cpp | 81 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index eeca66b..ced770b 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -47,6 +47,8 @@ using std::ofstream; using std::ostream; using std::ostringstream; using std::runtime_error; +using std::size; +using std::size_t; using std::string; using std::to_string; using std::vector; @@ -83,15 +85,28 @@ rand_double() { // ADS: in the interval [0, 1] } } // namespace simreads_random -static string +static inline string format_fastq_record(const string &name, const string &read) { assert(!name.empty()); - ostringstream oss; - oss << '@' << name << endl - << read << endl - << '+' << endl - << string(read.length(), 'B'); - return oss.str(); + string s; + s += '@'; + s += name; + s += '\n'; + s += read; + s += "\n+\n"; + s += string(size(read), 'B'); + return s; +} + +static inline string +format_fasta_record(const string &name, const string &read) { + assert(!name.empty()); + string s; + s += '>'; + s += name; + s += '\n'; + s += read; + return s; } struct FragInfo { @@ -99,22 +114,24 @@ struct FragInfo { string read1() const { assert(!name.empty()); string read = seq.substr(0, read_length); - for (size_t i = 0; i < read_length - read.length(); ++i) + for (size_t i = 0; i < read_length - size(read); ++i) read += random_base(); - return format_fastq_record(name + ".1", read); + return fasta_format ? format_fasta_record(name + ".1", read) + : format_fastq_record(name + ".1", read); } string read2() const { assert(!name.empty()); string read(seq); revcomp_inplace(read); read = read.substr(0, read_length); - for (size_t i = 0; i < read_length - read.length(); ++i) + for (size_t i = 0; i < read_length - size(read); ++i) read += random_base(); - return format_fastq_record(name + ".2", read); + return fasta_format ? format_fasta_record(name + ".2", read) + : format_fastq_record(name + ".2", read); } void erase_info_through_insert() { const size_t orig_ref_len = end_pos - start_pos; - if (2 * read_length < seq.length()) { + if (2 * read_length < size(seq)) { string cigar2(cigar); truncate_cigar_q(cigar, read_length); reverse_cigar(begin(cigar2), end(cigar2)); @@ -123,7 +140,7 @@ struct FragInfo { const size_t rseq_ops = cigar_rseq_ops(cigar) + cigar_rseq_ops(cigar2); cigar = cigar + to_string(orig_ref_len - rseq_ops) + "N" + cigar2; seq = seq.substr(0, read_length) + string(orig_ref_len - rseq_ops, 'N') + - seq.substr(seq.length() - read_length, read_length); + seq.substr(size(seq) - read_length, read_length); } } void remove_cigar_match_symbols() { @@ -148,20 +165,22 @@ struct FragInfo { bool rc() const { return strand == '-'; } string chrom; - size_t start_pos; - size_t end_pos; + size_t start_pos{}; + size_t end_pos{}; string name; - double score; - char strand; + double score{}; + char strand{}; string seq; string cigar; static bool pbat; + static bool fasta_format; static size_t frag_count; static size_t read_length; }; bool FragInfo::pbat = false; +bool FragInfo::fasta_format = false; size_t FragInfo::frag_count = 0; size_t FragInfo::read_length = 100; @@ -222,7 +241,7 @@ sim_frag_position(const string &genome, const size_t frag_len, string &the_frag, size_t &the_position) { static auto is_invalid = [](const char c) { return !valid_base(c); }; - const size_t lim = genome.length() - frag_len + 1; + const size_t lim = size(genome) - frag_len + 1; do { the_position = simreads_random::rand() % lim; the_frag = string(begin(genome) + the_position, @@ -296,7 +315,7 @@ struct FragMutator { string seq, cigar; size_t i = 0; the_info.score = 0; - while (i < the_info.seq.length()) { + while (i < size(the_info.seq)) { // select a mutation or not const char mut = sample_mutation(); if (mut == 'I') { @@ -426,6 +445,8 @@ simreads(int argc, const char **argv) { opt_parse.add_opt("pbat", 'a', "pbat", false, FragInfo::pbat); opt_parse.add_opt("random-pbat", 'R', "random pbat", false, random_pbat); opt_parse.add_opt("strand", 's', "strand {f, r, b}", false, strand_arg); + opt_parse.add_opt("fasta", '\0', "output fasta format (no quality scores)", + false, FragInfo::fasta_format); opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", false, rng_seed); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); @@ -481,18 +502,28 @@ simreads(int argc, const char **argv) { throw runtime_error("bad locations output file: " + locations_file); } - const string read1_outfile = output_prefix + "_1.fq"; - if (VERBOSE) - cerr << "[opening read1 fastq: " << read1_outfile << "]" << endl; + const string read1_outfile = + output_prefix + (FragInfo::fasta_format ? "_1.fa" : "_1.fq"); + if (VERBOSE) { + if (FragInfo::fasta_format) + cerr << "[opening read1 fastq: " << read1_outfile << "]" << endl; + else + cerr << "[opening read1 fasta: " << read1_outfile << "]" << endl; + } ofstream read1_out(read1_outfile); if (!read1_out) throw runtime_error("bad output file: " + read1_outfile); ofstream read2_out; if (!single_end) { - const string read2_outfile = output_prefix + "_2.fq"; - if (VERBOSE) - cerr << "[opening read2 fastq: " << read2_outfile << "]" << endl; + const string read2_outfile = + output_prefix + (FragInfo::fasta_format ? "_2.fa" : "_2.fq"); + if (VERBOSE) { + if (FragInfo::fasta_format) + cerr << "[opening read2 fastq: " << read2_outfile << "]" << endl; + else + cerr << "[opening read2 fasta: " << read2_outfile << "]" << endl; + } read2_out.open(read2_outfile); if (!read2_out) throw runtime_error("bad output file: " + read2_outfile); From 811806e2f3c70e10c3d25e69a4ac2a2f28fe7563 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 24 Oct 2024 19:39:43 -0700 Subject: [PATCH 03/13] data/md5sum.txt: updating test output hashes for changes to order of operations within simreads, which causes the rng to be used in a different order and changes the random output --- data/md5sum.txt | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/data/md5sum.txt b/data/md5sum.txt index 58ec784..ed55168 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -1,16 +1,16 @@ -ae8e662c3ab23ba742a45a9784390e63 tests/reads_1.fq -ddeea92ba05c5c6a909c136d5059140d tests/reads.mstats -7c43e9b1409c6725c70aeb18d8db3413 tests/reads_pbat_pe_1.fq -21c3b088e956de62e10a6d190ffb8744 tests/reads_pbat_pe_2.fq -8c170b022512012f078396cc19c41600 tests/reads_pbat_pe.mstats -299e0bc8d7862326d693a08b9fc70d76 tests/reads_pbat_pe.sam -ae8e662c3ab23ba742a45a9784390e63 tests/reads_pe_1.fq -d2723d2e99d2a3492af508633d61b30a tests/reads_pe_2.fq -6e2ba01e98f0effc005641ba6a7eb8b6 tests/reads_pe.mstats -5e661e3bb770f84bee1bf09ff9ec4f30 tests/reads_pe.sam -2e033e6c0f2e93000877d6f8f40bcc99 tests/reads_rpbat_pe_1.fq -a28bc47b567130bbcdc45b1daa54ae32 tests/reads_rpbat_pe_2.fq -ecf7559bdeecb3f5d23e2c85e1eb22ff tests/reads_rpbat_pe.mstats -5f1812a2d4e26e6fd447a8bf5fd9b132 tests/reads_rpbat_pe.sam -b621241624f8a8d030b42dd47838d5dd tests/reads.sam +5b564e80467f0600d1dc01e7fbb45cdb tests/reads_1.fq +38ac3541d7233d7e946c2e4b87591519 tests/reads.mstats +2e492e9d438dee062363449cfcbdb7df tests/reads_pbat_pe_1.fq +744e216ed05e34dead8e293721b1376b tests/reads_pbat_pe_2.fq +fc153607ee4799cca9d040976f4182ea tests/reads_pbat_pe.mstats +533826ad127ec9b5cb95229691cdac43 tests/reads_pbat_pe.sam +6b080ecdc9e7c58c24601ba95f475dcc tests/reads_pe_1.fq +3eb9b169c45590eea443715284c80dce tests/reads_pe_2.fq +093aa787322a3e5250fb229a62eac201 tests/reads_pe.mstats +b1c5b9d7a991d22dc1958badb5ef6224 tests/reads_pe.sam +4253b0de3da8715e8851c90962d02c1e tests/reads_rpbat_pe_1.fq +18daaa57cde00d36d612495aa4acc35b tests/reads_rpbat_pe_2.fq +4be779558f7fb70775c32a80f5d38455 tests/reads_rpbat_pe.mstats +8b33c64513bd6989823a2c47bf632963 tests/reads_rpbat_pe.sam +acb6582d5b8def45a0d38d8fee5c0edb tests/reads.sam bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx From d594e66187a95d799022525357982ba583b0c90c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 12:44:42 -0700 Subject: [PATCH 04/13] src/abismal.cpp: adding version number in help output for abismal --- src/abismal.cpp | 223 ++++++++++++++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 81 deletions(-) diff --git a/src/abismal.cpp b/src/abismal.cpp index 5c9fef1..10c0741 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -96,7 +96,7 @@ get_strand_code(const char strand, const conversion_type conv) { } struct ReadLoader { - ReadLoader(const string &fn): cur_line{0}, filename{fn}, in{fn, "r"} {} + ReadLoader(const string &fn) : cur_line{0}, filename{fn}, in{fn, "r"} {} bool good() const { return in; } @@ -132,7 +132,8 @@ struct ReadLoader { [](const char c) { return c != 'N'; }) < min_read_length) line.clear(); else { - while (line.back() == 'N') line.pop_back(); // remove Ns from 3' + while (line.back() == 'N') + line.pop_back(); // remove Ns from 3' line = line.substr(line.find_first_of("ACGT")); // removes Ns from 5' } reads.emplace_back(line); @@ -161,7 +162,8 @@ const uint32_t ReadLoader::min_read_length = // alignment matrix for a batch of reads static inline void update_max_read_length(size_t &max_length, const vector &reads) { - for (auto &i : reads) max_length = max(max_length, i.size()); + for (auto &i : reads) + max_length = max(max_length, i.size()); } struct se_element { // size = 8 @@ -169,10 +171,10 @@ struct se_element { // size = 8 flags_t flags; // 2 bytes uint32_t pos; // 4 bytes - se_element(): diffs(MAX_DIFFS), flags(0), pos(0) {} + se_element() : diffs(MAX_DIFFS), flags(0), pos(0) {} - se_element(const score_t d, const flags_t f, const uint32_t p) - : diffs(d), flags(f), pos(p) {} + se_element(const score_t d, const flags_t f, const uint32_t p) : + diffs(d), flags(f), pos(p) {} bool operator==(const se_element &rhs) const { return pos == rhs.pos && flags == rhs.flags; @@ -248,13 +250,15 @@ valid_hit(const se_element s, const uint32_t readlen) { return s.diffs < static_cast(se_element::invalid_hit_frac * readlen); } -template static inline T +template +static inline T max16(const T x, const T y) { return (x > y) ? x : y; } struct se_candidates { - se_candidates(): sz(1), best(se_element()), v(vector(max_size)) {} + se_candidates() : + sz(1), best(se_element()), v(vector(max_size)) {} inline bool full() const { return sz == max_size; }; @@ -284,7 +288,9 @@ struct se_candidates { pop_heap(begin(v), begin(v) + sz); v[sz - 1] = se_element(d, s, p); } - else { v[sz++] = se_element(d, s, p); } + else { + v[sz++] = se_element(d, s, p); + } push_heap(begin(v), begin(v) + sz); } @@ -357,7 +363,8 @@ static inline bool chrom_and_posn(const ChromLookup &cl, const bam_cigar_t &cig, const uint32_t p, uint32_t &r_p, uint32_t &r_e, uint32_t &r_chr) { const uint32_t ref_ops = cigar_rseq_ops(cig); - if (!cl.get_chrom_idx_and_offset(p, ref_ops, r_chr, r_p)) return false; + if (!cl.get_chrom_idx_and_offset(p, ref_ops, r_chr, r_p)) + return false; r_e = r_p + ref_ops; return true; } @@ -370,7 +377,8 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, bam_rec &sr) { const bool ambig = res.ambig(); const bool valid = !res.empty(); - if (!allow_ambig && ambig) return map_ambig; + if (!allow_ambig && ambig) + return map_ambig; uint32_t ref_s = 0, ref_e = 0, chrom_idx = 0; if (!valid || !chrom_and_posn(cl, cigar, res.pos, ref_s, ref_e, chrom_idx)) @@ -378,9 +386,11 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, // ADS: we might be doing format_se for a mate in paried reads uint16_t flag = 0; - if (res.rc()) flag |= BAM_FREVERSE; + if (res.rc()) + flag |= BAM_FREVERSE; - if (allow_ambig && ambig) flag |= BAM_FSECONDARY; + if (allow_ambig && ambig) + flag |= BAM_FSECONDARY; // flag |= BAM_FREAD1; // ADS: this might be wrong... @@ -401,20 +411,23 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, read.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("failed to format bam"); + if (ret < 0) + throw runtime_error("failed to format bam"); ret = bam_aux_update_int(sr.b, "NM", res.diffs); - if (ret < 0) throw runtime_error("bam_aux_update_int"); + if (ret < 0) + throw runtime_error("bam_aux_update_int"); ret = bam_aux_append(sr.b, "CV", 'A', 1, (uint8_t *)(res.elem_is_a_rich() ? "A" : "T")); - if (ret < 0) throw runtime_error("bam_aux_append"); + if (ret < 0) + throw runtime_error("bam_aux_append"); return ambig ? map_ambig : map_unique; } struct pe_element { - pe_element(): aln_score(0), r1(se_element()), r2(se_element()) {} + pe_element() : aln_score(0), r1(se_element()), r2(se_element()) {} score_t diffs() const { return r1.diffs + r2.diffs; } @@ -503,10 +516,12 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, bam_rec &sr1, bam_rec &sr2) { static const uint8_t cv[2] = {'T', 'A'}; - if (p.empty()) return map_unmapped; + if (p.empty()) + return map_unmapped; const bool ambig = p.ambig(); - if (!allow_ambig && ambig) return map_ambig; + if (!allow_ambig && ambig) + return map_ambig; uint32_t r_s1 = 0, r_e1 = 0, chr1 = 0; // positions in chroms (0-based) uint32_t r_s2 = 0, r_e2 = 0, chr2 = 0; @@ -559,13 +574,16 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, read1.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("error formatting bam"); + if (ret < 0) + throw runtime_error("error formatting bam"); ret = bam_aux_update_int(sr1.b, "NM", p.r1.diffs); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); ret = bam_aux_append(sr1.b, "CV", 'A', 1, cv + p.r1.elem_is_a_rich()); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); sr2.b = bam_init1(); ret = bam_set1(sr2.b, @@ -584,19 +602,22 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, read2.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("failed to format bam"); + if (ret < 0) + throw runtime_error("failed to format bam"); ret = bam_aux_update_int(sr2.b, "NM", p.r2.diffs); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); ret = bam_aux_append(sr2.b, "CV", 'A', 1, cv + p.r2.elem_is_a_rich()); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); return ambig ? map_ambig : map_unique; } struct pe_candidates { - pe_candidates(): v(vector(max_size_large)) {} + pe_candidates() : v(vector(max_size_large)) {} inline void reset(const uint32_t readlen) { v.front().reset(readlen); @@ -735,7 +756,8 @@ struct se_map_stats { reads_unmapped += !valid; skipped_reads += read.empty(); - if (valid && (allow_ambig || !ambig)) update_error_rate(s.diffs, cigar); + if (valid && (allow_ambig || !ambig)) + update_error_rate(s.diffs, cigar); } void update_error_rate(const score_t diffs, const bam_cigar_t &cigar) { @@ -765,7 +787,8 @@ struct se_map_stats { assign_values(); string t; - for (size_t i = 0; i < n_tabs; ++i) t += tab; + for (size_t i = 0; i < n_tabs; ++i) + t += tab; ostringstream oss; // clang-format off oss << t << "total_reads: " << total_reads << endl @@ -951,7 +974,8 @@ select_output(const bool allow_ambig, const ChromLookup &cl, name1, name2, cig1, cig2, sr1, sr2); if (!best.should_report(allow_ambig) || pe_map_type == map_unmapped) { - if (pe_map_type == map_unmapped) best.reset(); + if (pe_map_type == map_unmapped) + best.reset(); if (format_se(allow_ambig, se1, cl, read1, name1, cig1, sr1) == map_unmapped) se1.reset(); @@ -993,7 +1017,7 @@ full_compare(const score_t cutoff, const PackedRead::const_iterator read_end, return d; } -template +template static inline void check_hits(const uint32_t offset, const PackedRead::const_iterator read_st, const PackedRead::const_iterator read_end, @@ -1016,12 +1040,13 @@ check_hits(const uint32_t offset, const PackedRead::const_iterator read_st, full_compare(res.cutoff, read_end, ((the_pos & 15u) << 2), read_st, genome_st + (the_pos >> 4)); - if (diffs <= res.cutoff) res.update(specific, diffs, strand_code, the_pos); + if (diffs <= res.cutoff) + res.update(specific, diffs, strand_code, the_pos); } } struct compare_bases { - compare_bases(const genome_iterator g_): g(g_) {} + compare_bases(const genome_iterator g_) : g(g_) {} bool operator()(const uint32_t mid, const two_letter_t chr) const { return get_bit(*(g + mid)) < chr; @@ -1030,7 +1055,8 @@ struct compare_bases { const genome_iterator g; }; -template static uint32_t +template +static uint32_t find_candidates(const uint32_t max_candidates, const Read::const_iterator read_start, const genome_iterator gi, const uint32_t read_lim, vector::const_iterator &low, @@ -1062,14 +1088,15 @@ find_candidates(const uint32_t max_candidates, return p; } -template static inline three_letter_t +template +static inline three_letter_t get_three_letter_num_fast(const uint8_t nt) { return (the_conv == c_to_t) ? nt & 5 : // C=T=0, A=1, G=4 nt & 10; // A=G=0, C=2, T=8 } -template struct compare_bases_three { - compare_bases_three(const genome_iterator g_): g(g_) {} +template struct compare_bases_three { + compare_bases_three(const genome_iterator g_) : g(g_) {} bool operator()(const uint32_t mid, const three_letter_t chr) const { return get_three_letter_num_fast(*(g + mid)) < chr; @@ -1078,7 +1105,7 @@ template struct compare_bases_three { const genome_iterator g; }; -template +template static uint32_t find_candidates_three(const uint32_t max_candidates, const Read::const_iterator read_start, @@ -1131,7 +1158,8 @@ get_conv_type(const uint16_t strand_code) { : (c_to_t)); } -template static void +template +static void process_seeds(const uint32_t max_candidates, const vector::const_iterator counter_st, const vector::const_iterator counter_three_st, @@ -1198,7 +1226,8 @@ process_seeds(const uint32_t max_candidates, shift_three_key(*(read_idx + seed::key_weight_three), k_three); } - if (!res.should_do_sensitive()) return; + if (!res.should_do_sensitive()) + return; read_idx = begin(read_seed); get_1bit_hash(read_idx, k); @@ -1237,7 +1266,8 @@ process_seeds(const uint32_t max_candidates, } } -template static void +template +static void prep_read(const string &r, Read &pread) { pread.resize(r.size()); for (size_t i = 0; i != r.size(); ++i) @@ -1273,7 +1303,8 @@ pack_read(const Read &pread, PackedRead &packed_pread) { } // do not fill the flanking position - if (pread_ind == sz) return; + if (pread_ind == sz) + return; // now put only the remaining bases in the last pos. The rest // should match any base in the reference @@ -1282,7 +1313,8 @@ pack_read(const Read &pread, PackedRead &packed_pread) { while (pread_ind < sz) *it |= (static_cast(pread[pread_ind++]) << ((j++) << 2)); - while (j < NUM_BASES_PER_ELEMENT) *it |= base_match_any << ((j++) << 2); + while (j < NUM_BASES_PER_ELEMENT) + *it |= base_match_any << ((j++) << 2); } static inline bool @@ -1350,7 +1382,8 @@ align_se_candidates(const Read &pread_t, const Read &pread_t_rc, best.diffs = simple_aln::edit_distance(best_scr, len, cigar); // do not report and count it as unmapped if not valid - if (!valid(best, len, readlen, cutoff)) best.reset(); + if (!valid(best, len, readlen, cutoff)) + best.reset(); } else best.reset(); @@ -1363,11 +1396,13 @@ valid_bam_rec(const bam_rec &b) { static inline void reset_bam_rec(bam_rec &b) { - if (b.b) bam_destroy1(b.b); + if (b.b) + bam_destroy1(b.b); b.b = nullptr; } -template static void +template +static void map_single_ended(const bool show_progress, const bool allow_ambig, const AbismalIndex &abismal_index, ReadLoader &rl, se_map_stats &se_stats, bamxx::bam_header &hdr, @@ -1458,14 +1493,16 @@ map_single_ended(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr[i])) reset_bam_rec(mr[i]); + if (valid_bam_rec(mr[i])) + reset_bam_rec(mr[i]); se_stats.update(allow_ambig, reads[i], cigar[i], bests[i]); cigar[i].clear(); } if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -1569,14 +1606,16 @@ map_single_ended_rand(const bool show_progress, const bool allow_ambig, throw runtime_error("failed to write bam"); } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr[i])) reset_bam_rec(mr[i]); + if (valid_bam_rec(mr[i])) + reset_bam_rec(mr[i]); se_stats.update(allow_ambig, reads[i], cigar[i], bests[i]); cigar[i].clear(); } if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -1589,7 +1628,8 @@ format_time_in_sec(const double t) { return oss.str(); } -template static void +template +static void run_single_ended(const bool show_progress, const bool allow_ambig, const string &reads_file, const AbismalIndex &abismal_index, se_map_stats &se_stats, bamxx::bam_header &hdr, @@ -1622,7 +1662,8 @@ best_single(const pe_candidates &pres, se_candidates &res) { res.update(false, i->diffs, i->flags, i->pos); } -template static void +template +static void best_pair(const pe_candidates &res1, const pe_candidates &res2, const Read &pread1, const Read &pread2, bam_cigar_t &cigar1, bam_cigar_t &cigar2, vector &mem_scr1, @@ -1736,7 +1777,8 @@ best_pair(const pe_candidates &res1, const pe_candidates &res2, } } -template static bool +template +static bool select_maps(const Read &pread1, const Read &pread2, bam_cigar_t &cig1, bam_cigar_t &cig2, pe_candidates &res1, pe_candidates &res2, vector &mem_scr1, se_candidates &res_se1, @@ -1752,8 +1794,8 @@ select_maps(const Read &pread1, const Read &pread2, bam_cigar_t &cig1, return true; } -template +template static inline bool map_fragments(const uint32_t max_candidates, const string &read1, const string &read2, @@ -1770,7 +1812,8 @@ map_fragments(const uint32_t max_candidates, const string &read1, res1.reset(read1.size()); res2.reset(read2.size()); - if (read1.empty() && read2.empty()) return false; + if (read1.empty() && read2.empty()) + return false; if (!read1.empty()) { prep_read(read1, pread1); @@ -1793,7 +1836,8 @@ map_fragments(const uint32_t max_candidates, const string &read1, mem_scr1, res_se1, res_se2, aln, best); } -template static void +template +static void map_paired_ended(const bool show_progress, const bool allow_ambig, const AbismalIndex &abismal_index, ReadLoader &rl1, ReadLoader &rl2, pe_map_stats &pe_stats, @@ -1944,8 +1988,10 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr1[i])) reset_bam_rec(mr1[i]); - if (valid_bam_rec(mr2[i])) reset_bam_rec(mr2[i]); + if (valid_bam_rec(mr1[i])) + reset_bam_rec(mr1[i]); + if (valid_bam_rec(mr2[i])) + reset_bam_rec(mr2[i]); pe_stats.update(allow_ambig, reads1[i], reads2[i], cigar1[i], cigar2[i], bests[i], bests_se1[i], bests_se2[i]); cigar1[i].clear(); @@ -1954,7 +2000,8 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -2119,8 +2166,10 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr1[i])) reset_bam_rec(mr1[i]); - if (valid_bam_rec(mr2[i])) reset_bam_rec(mr2[i]); + if (valid_bam_rec(mr1[i])) + reset_bam_rec(mr1[i]); + if (valid_bam_rec(mr2[i])) + reset_bam_rec(mr2[i]); pe_stats.update(allow_ambig, reads1[i], reads2[i], cigar1[i], cigar2[i], bests[i], bests_se1[i], bests_se2[i]); cigar1[i].clear(); @@ -2129,12 +2178,14 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } -template static void +template +static void run_paired_ended(const bool show_progress, const bool allow_ambig, const string &reads_file1, const string &reads_file2, const AbismalIndex &abismal_index, pe_map_stats &pe_stats, @@ -2209,6 +2260,10 @@ abismal_make_sam_header(const ChromLookup &cl, const int argc, int abismal(int argc, const char **argv) { try { + + const string description = + string("map bisulfite converted reads (v") + VERSION + string(")"); + bool VERBOSE = false; bool GA_conversion = false; bool allow_ambig = false; @@ -2223,7 +2278,7 @@ abismal(int argc, const char **argv) { string stats_outfile = ""; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "map bisulfite converted reads", + OptionParser opt_parse(strip_path(argv[0]), description, " []"); opt_parse.set_show_defaults(); opt_parse.add_opt("index", 'i', "index file", false, index_file); @@ -2255,30 +2310,30 @@ abismal(int argc, const char **argv) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << "help requested" << endl; cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << "about requested" << endl; cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << "missing required option" << endl; + cerr << "Missing required option." << endl; cerr << opt_parse.option_missing_message() << endl; return EXIT_SUCCESS; } if (leftover_args.size() != 1 && leftover_args.size() != 2) { cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (n_threads <= 0) { - cerr << "please choose a positive number of threads" << endl; + cerr << "Please choose a positive number of threads." << endl; return EXIT_SUCCESS; } if (index_file.empty() == genome_file.empty()) { - cerr << "please select either an index file (-i) or a genome file (-g)" + cerr << "Please select either an index file (-i) or a genome file (-g)." << endl; return EXIT_SUCCESS; } @@ -2307,15 +2362,16 @@ abismal(int argc, const char **argv) { const int n_procs = omp_get_num_procs(); int num_threads_fulfilled = 1; #pragma omp parallel - { num_threads_fulfilled = omp_get_num_threads(); } + { + num_threads_fulfilled = omp_get_num_threads(); + } if (VERBOSE && n_threads > n_procs) - print_with_time( - "[WARNING] requesting more threads than the " - "maximum of " + - to_string(n_procs) + - " processors available in " - "this device"); + print_with_time("[WARNING] requesting more threads than the " + "maximum of " + + to_string(n_procs) + + " processors available in " + "this device"); if (VERBOSE) print_with_time("using " + to_string(num_threads_fulfilled) + @@ -2345,7 +2401,8 @@ abismal(int argc, const char **argv) { const double start_time = omp_get_wtime(); if (!index_file.empty()) { - if (VERBOSE) print_with_time("loading index " + index_file); + if (VERBOSE) + print_with_time("loading index " + index_file); abismal_index.read(index_file); if (VERBOSE) @@ -2353,7 +2410,8 @@ abismal(int argc, const char **argv) { format_time_in_sec(omp_get_wtime() - start_time)); } else { - if (VERBOSE) print_with_time("indexing genome " + genome_file); + if (VERBOSE) + print_with_time("indexing genome " + genome_file); abismal_index.create_index(genome_file); if (VERBOSE) print_with_time("indexing time: " + @@ -2371,14 +2429,17 @@ abismal(int argc, const char **argv) { pe_map_stats pe_stats; bamxx::bam_out out(outfile, write_bam_fmt); - if (!out) throw runtime_error("failed to open output file: " + outfile); + if (!out) + throw runtime_error("failed to open output file: " + outfile); bamxx::bam_header hdr; int ret = abismal_make_sam_header(abismal_index.cl, argc, argv, hdr); - if (ret < 0) throw runtime_error("error formatting header"); + if (ret < 0) + throw runtime_error("error formatting header"); - if (!out.write(hdr)) throw runtime_error("error writing header"); + if (!out.write(hdr)) + throw runtime_error("error writing header"); if (reads_file2.empty()) { if (GA_conversion || pbat_mode) From d39e5a8c248eb8ac63c421660a8ca04b02d0844c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 14:51:07 -0700 Subject: [PATCH 05/13] src/abismalidx.cpp: reporting version with help message --- src/abismalidx.cpp | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/abismalidx.cpp b/src/abismalidx.cpp index 7574025..566d227 100644 --- a/src/abismalidx.cpp +++ b/src/abismalidx.cpp @@ -15,45 +15,51 @@ * General Public License for more details. */ -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include "abismalidx.hpp" +#include "AbismalIndex.hpp" + #include "OptionParser.hpp" #include "dna_four_bit.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -#include "abismalidx.hpp" -#include "AbismalIndex.hpp" +#include #include -#include +#include +#include #include -#include -#include +#include #include -#include +#include #include -#include +#include -using std::vector; -using std::runtime_error; -using std::string; using std::cerr; using std::endl; +using std::runtime_error; +using std::string; using std::unordered_set; +using std::vector; int abismalidx(int argc, const char **argv) { try { + + const string description = + string("build abismal index (v") + VERSION + string(")"); + string target_regions_file; bool VERBOSE = false; size_t n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "build abismal index", + OptionParser opt_parse(strip_path(argv[0]), description, " ", 2); opt_parse.set_show_defaults(); - opt_parse.add_opt("targets", 'A', "target regions", - false, target_regions_file); + opt_parse.add_opt("targets", 'A', "target regions", false, + target_regions_file); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); @@ -101,9 +107,9 @@ abismalidx(int argc, const char **argv) { abismal_index.write(outfile); if (VERBOSE) - cerr << "[total indexing time: " << omp_get_wtime() - start_time << "]" << endl; + cerr << "[total indexing time: " << omp_get_wtime() - start_time << "]" + << endl; /****************** END BUILDING INDEX *************/ - } catch (const std::runtime_error &e) { cerr << e.what() << endl; From ea2ec3554569772324c6b6254595724fd3384191 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 14:51:42 -0700 Subject: [PATCH 06/13] src/simreads.cpp: setting default simulation behavior to allowing non-ACGT bases within fragments --- src/simreads.cpp | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index ced770b..57dc5f8 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Andrew D. Smith + * Copyright (C) 2018-2024 Andrew D. Smith * * Author: Andrew D. Smith * @@ -37,6 +37,8 @@ #include // getpid() +using std::cbegin; +using std::cend; using std::cerr; using std::endl; using std::function; @@ -238,16 +240,16 @@ operator<<(ostream &out, FragInfo &the_info) { // extract the position of the fragment checking all bases are valid static void sim_frag_position(const string &genome, const size_t frag_len, string &the_frag, - size_t &the_position) { + size_t &the_position, const bool require_valid) { static auto is_invalid = [](const char c) { return !valid_base(c); }; const size_t lim = size(genome) - frag_len + 1; do { the_position = simreads_random::rand() % lim; - the_frag = string(begin(genome) + the_position, - begin(genome) + the_position + frag_len); - } while (find_if(begin(the_frag), end(the_frag), is_invalid) != - end(the_frag)); + the_frag = string(cbegin(genome) + the_position, + cbegin(genome) + the_position + frag_len); + } while (require_valid && find_if(cbegin(the_frag), cend(the_frag), + is_invalid) != cend(the_frag)); } // simulate from a uniform distribution in a range @@ -262,11 +264,14 @@ sim_frag_length(const size_t min_length, const size_t max_length) { struct FragSampler { FragSampler(const string &g, const ChromLookup c, const char sc, - const size_t milen, const size_t malen) : - genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen) {} + const size_t milen, const size_t malen, + const bool require_valid) : + genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen), + require_valid(require_valid) {} void sample_fragment(FragInfo &the_info) const { const size_t frag_len = sim_frag_length(min_length, max_length); - sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos); + sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos, + require_valid); uint32_t offset = 0, chrom_idx = 0; cl.get_chrom_idx_and_offset(the_info.start_pos, chrom_idx, offset); @@ -293,9 +298,10 @@ struct FragSampler { } const string &genome; ChromLookup cl; - char strand_code; - size_t min_length; - size_t max_length; + char strand_code{}; + size_t min_length{}; + size_t max_length{}; + bool require_valid{}; }; struct FragMutator { @@ -400,6 +406,7 @@ simreads(int argc, const char **argv) { bool single_end = false; bool show_cigar_matches = true; bool random_pbat = false; + bool require_valid = false; size_t n_reads = 100; size_t min_frag_len = 100; @@ -449,6 +456,8 @@ simreads(int argc, const char **argv) { false, FragInfo::fasta_format); opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", false, rng_seed); + opt_parse.add_opt("require-valid", '\0', "require valid bases in fragments", + false, require_valid); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -529,7 +538,8 @@ simreads(int argc, const char **argv) { throw runtime_error("bad output file: " + read2_outfile); } - FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len); + FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len, + require_valid); if (VERBOSE) cerr << "[constructed fragment sampler]" << endl; From fbdaced9acd4a1cc832c4ea58b3b7f1f91335317 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 15:21:21 -0700 Subject: [PATCH 07/13] src/abismalidx.cpp and src/abismal.cpp: cleaning up version and help printing stuff --- src/abismal.cpp | 4 ++-- src/abismalidx.cpp | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/abismal.cpp b/src/abismal.cpp index 10c0741..2a3ac3a 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -2261,8 +2261,8 @@ int abismal(int argc, const char **argv) { try { - const string description = - string("map bisulfite converted reads (v") + VERSION + string(")"); + const string version_str = string("(v") + VERSION + string(")"); + const string description = "map bisulfite converted reads " + version_str; bool VERBOSE = false; bool GA_conversion = false; diff --git a/src/abismalidx.cpp b/src/abismalidx.cpp index 566d227..0f69d1e 100644 --- a/src/abismalidx.cpp +++ b/src/abismalidx.cpp @@ -47,8 +47,8 @@ abismalidx(int argc, const char **argv) { try { - const string description = - string("build abismal index (v") + VERSION + string(")"); + const string version_str = string("(v") + VERSION + string(")"); + const string description = "build abismal index " + version_str; string target_regions_file; bool VERBOSE = false; @@ -67,6 +67,7 @@ abismalidx(int argc, const char **argv) { opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { From 8921c59eb5673e1511aa3318d47d592a153ff886 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 16:17:10 -0700 Subject: [PATCH 08/13] src/simreads.cpp: fixing major bug when trying to write locations for simulated reads --- src/simreads.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index 57dc5f8..6f02ad1 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -402,7 +402,6 @@ simreads(int argc, const char **argv) { string locations_file; bool VERBOSE = false; - bool write_locations = false; bool single_end = false; bool show_cigar_matches = true; bool random_pbat = false; @@ -434,7 +433,8 @@ simreads(int argc, const char **argv) { opt_parse.set_show_defaults(); opt_parse.add_opt("out", 'o', "output file prefix", true, output_prefix); opt_parse.add_opt("single", '\0', "output single end", false, single_end); - opt_parse.add_opt("loc", '\0', "write locations", false, write_locations); + opt_parse.add_opt("loc", '\0', "write locations here", false, + locations_file); opt_parse.add_opt("read-len", 'l', "read length", false, FragInfo::read_length); opt_parse.add_opt("min-fraglen", '\0', "min fragment length", false, @@ -502,7 +502,7 @@ simreads(int argc, const char **argv) { [](unsigned char c) { return toupper(c); }); ofstream loc_out; - if (write_locations) { + if (!locations_file.empty()) { if (VERBOSE) cerr << "[opening frag locations file: " << locations_file << "]" << endl; @@ -558,7 +558,7 @@ simreads(int argc, const char **argv) { info.bisulfite_conversion(random_pbat, bs_conv); if (!show_cigar_matches) info.remove_cigar_match_symbols(); - if (write_locations) + if (!locations_file.empty()) loc_out << info << '\n'; read1_out << info.read1() << '\n'; if (!single_end) From 45460a4ed4e587324700bb5f0e5e9cd1343e1912 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 17:46:52 -0700 Subject: [PATCH 09/13] src/simreads.cpp: using uint64_t for the position of the simulated read in the genome because otherwise it was not enough for the human genome --- src/simreads.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index 6f02ad1..078557d 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -53,6 +53,7 @@ using std::size; using std::size_t; using std::string; using std::to_string; +using std::uint64_t; using std::vector; template using num_lim = std::numeric_limits; @@ -65,7 +66,7 @@ namespace simreads_random { bool initialized = false; std::default_random_engine e; std::uniform_real_distribution dr; -std::uniform_int_distribution di; +std::uniform_int_distribution di; void initialize(const size_t the_seed) { e = std::default_random_engine(the_seed); From 49a97192ada966449d17e7be465ff81d52d7e2ca Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 25 Oct 2024 20:01:23 -0700 Subject: [PATCH 10/13] src/simreads.cpp: code formatting --- src/simreads.cpp | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index 078557d..161f6fd 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -37,9 +37,11 @@ #include // getpid() +using std::begin; using std::cbegin; using std::cend; using std::cerr; +using std::end; using std::endl; using std::function; using std::ifstream; @@ -53,6 +55,7 @@ using std::size; using std::size_t; using std::string; using std::to_string; +using std::transform; using std::uint64_t; using std::vector; @@ -207,7 +210,7 @@ operator<<(ostream &out, FragInfo &the_info) { const size_t read_pos = the_info.start_pos + 1; const size_t mate_pos = the_info.end_pos - FragInfo::read_length + 1; - const int tlen = rc ? (-the_info.seq.size()) : (the_info.seq.size()); + const int tlen = rc ? -size(the_info.seq) : size(the_info.seq); string cigar1 = the_info.cigar; string cigar2 = the_info.cigar; @@ -227,15 +230,30 @@ operator<<(ostream &out, FragInfo &the_info) { const size_t pos1 = rc ? mate_pos : read_pos; const size_t pos2 = rc ? read_pos : mate_pos; - return out << the_info.name << ".1\t" << flags_read << '\t' << the_info.chrom - << '\t' << pos1 << '\t' << "255\t" << cigar1 << '\t' << "=\t" - << pos2 << "\t" << tlen << '\t' << seq1 << "\t" - << "*" << endl - - << the_info.name << ".2\t" << flags_mate << '\t' << the_info.chrom - << '\t' << pos2 << '\t' << "255\t" << cigar2 << '\t' << "=\t" - << pos1 << "\t" << -tlen << '\t' << seq2 << "\t" + // clang-format off + return out << the_info.name << ".1" << '\t' + << flags_read << '\t' + << the_info.chrom << '\t' + << pos1 << '\t' + << "255" << '\t' + << cigar1 << '\t' + << "=" << '\t' + << pos2 << '\t' + << tlen << '\t' + << seq1 << '\t' + << "*" << '\n' + << the_info.name << ".2" << '\t' + << flags_mate << '\t' + << the_info.chrom << '\t' + << pos2 << '\t' + << "255" << '\t' + << cigar2 << '\t' + << "=" << '\t' + << pos1 << '\t' + << -tlen << '\t' + << seq2 << '\t' << "*"; + // clang-format on } // extract the position of the fragment checking all bases are valid @@ -347,8 +365,8 @@ struct FragMutator { ++i; } } - the_info.cigar.resize(2 * cigar.size()); - compress_cigar(begin(cigar), end(cigar), the_info.cigar); + the_info.cigar.resize(2 * size(cigar)); + compress_cigar(cbegin(cigar), cend(cigar), the_info.cigar); swap(seq, the_info.seq); } char sample_mutation() const { @@ -474,7 +492,7 @@ simreads(int argc, const char **argv) { cerr << opt_parse.option_missing_message() << endl; return EXIT_SUCCESS; } - if (leftover_args.size() != 1) { + if (size(leftover_args) != 1) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; } @@ -499,7 +517,7 @@ simreads(int argc, const char **argv) { string genome; ChromLookup cl; load_genome(genome_file, genome, cl); - transform(begin(genome), end(genome), begin(genome), + transform(cbegin(genome), cend(genome), begin(genome), [](unsigned char c) { return toupper(c); }); ofstream loc_out; From 1cbbac132d94d1a7bc7ce6e55c0abca98ba6c898 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 26 Oct 2024 16:33:59 -0700 Subject: [PATCH 11/13] src/simreads.cpp: trying to debug an issue with integer values that seems to prevent simulating all the way through the human genome --- src/simreads.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index 161f6fd..7ceaff2 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -75,7 +75,8 @@ initialize(const size_t the_seed) { e = std::default_random_engine(the_seed); initialized = true; } -int + +uint64_t rand() { assert(initialized); // ADS: should have same range as ordinary rand() by properties of @@ -292,7 +293,7 @@ struct FragSampler { sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos, require_valid); - uint32_t offset = 0, chrom_idx = 0; + uint64_t offset = 0, chrom_idx = 0; cl.get_chrom_idx_and_offset(the_info.start_pos, chrom_idx, offset); the_info.chrom = cl.names[chrom_idx]; the_info.start_pos = offset; From 80a2d95bf22d4931fb5331c8da248f718d837598 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 26 Oct 2024 17:11:56 -0700 Subject: [PATCH 12/13] src/simreads.cpp: apparently found the bug and it involved returning the int from the rng, despite the values generated by the rng being larger and the values it got assigned to being larger, they were squeezed through int --- src/simreads.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/simreads.cpp b/src/simreads.cpp index 7ceaff2..712d35f 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -293,7 +293,8 @@ struct FragSampler { sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos, require_valid); - uint64_t offset = 0, chrom_idx = 0; + uint32_t offset = 0; + uint32_t chrom_idx = 0; cl.get_chrom_idx_and_offset(the_info.start_pos, chrom_idx, offset); the_info.chrom = cl.names[chrom_idx]; the_info.start_pos = offset; @@ -306,15 +307,12 @@ struct FragSampler { the_info.cigar = to_string(frag_len) + "M"; // default, no muts } char sim_strand() const { - if (strand_code == 'f') - return '+'; - else if (strand_code == 'r') - return '-'; - else if (strand_code == 'b') - return (simreads_random::rand() & 1) ? '+' : '-'; - else - throw runtime_error("bad strand code: " + to_string(strand_code)); - return '\0'; + switch (strand_code) { + case 'f': return '+'; + case 'r': return '-'; + case 'b': return (simreads_random::rand() & 1) ? '+' : '-'; + default: std::abort(); + } } const string &genome; ChromLookup cl; From 6c03831e60cb1964a5bd64ace0b705364eb0c698 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 26 Oct 2024 18:11:23 -0700 Subject: [PATCH 13/13] data/md5sum.txt: updating test output file hashes for changes in rng from simulations --- data/md5sum.txt | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/data/md5sum.txt b/data/md5sum.txt index 81142e0..1be10d3 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -1,16 +1,16 @@ -5b564e80467f0600d1dc01e7fbb45cdb tests/reads_1.fq -38ac3541d7233d7e946c2e4b87591519 tests/reads.mstats -2e492e9d438dee062363449cfcbdb7df tests/reads_pbat_pe_1.fq -744e216ed05e34dead8e293721b1376b tests/reads_pbat_pe_2.fq -fc153607ee4799cca9d040976f4182ea tests/reads_pbat_pe.mstats -22893c68f28daad043c93d5c8c28eadc tests/reads_pbat_pe.sam -6b080ecdc9e7c58c24601ba95f475dcc tests/reads_pe_1.fq -3eb9b169c45590eea443715284c80dce tests/reads_pe_2.fq -093aa787322a3e5250fb229a62eac201 tests/reads_pe.mstats -cb1b28601416069132db0f9a9442abec tests/reads_pe.sam -4253b0de3da8715e8851c90962d02c1e tests/reads_rpbat_pe_1.fq -18daaa57cde00d36d612495aa4acc35b tests/reads_rpbat_pe_2.fq -4be779558f7fb70775c32a80f5d38455 tests/reads_rpbat_pe.mstats -ca9a943134a9f099b2e8d136088fb978 tests/reads_rpbat_pe.sam -ae1c1b996c7af56983c5952538d1c8ee tests/reads.sam +8dc13cfc9c7af9c5ed368d5e2462275e tests/reads_1.fq +04f73fa36d90fbe9e157e5c3214e585a tests/reads.mstats +da3003dc18e6ecfcf0586705444ca6f8 tests/reads_pbat_pe_1.fq +bbfd18b0cc7cf93ad4ed317ef996f3ee tests/reads_pbat_pe_2.fq +7d2e72d75fe2e55f978dffa76a492c61 tests/reads_pbat_pe.mstats +2415d44925cc23fa60a0d78c9af04509 tests/reads_pbat_pe.sam +d49efb9f420b4edeb4647217fa247e6d tests/reads_pe_1.fq +3627c807f259109ea0c4c2b7bcd12320 tests/reads_pe_2.fq +0efe8fb80106f2edb370e8f7e7c1bbb6 tests/reads_pe.mstats +3c10bd0ce3f7d458a05a334a351d96ff tests/reads_pe.sam +3dd98274d8b707878aaad8246c6df9f6 tests/reads_rpbat_pe_1.fq +fcf5e4d93ac62dafcfb279e95103ad0e tests/reads_rpbat_pe_2.fq +19329a8ec659a82dc82e56a0ca6999b0 tests/reads_rpbat_pe.mstats +c6980b863b86d7e491e3f9358943bcd3 tests/reads_rpbat_pe.sam +0f2ad3720fd50961494222a3cf1dbef1 tests/reads.sam bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx