From 53ad0b7a0bd3ab81d45d6ee1fe9d28e20e3184cc Mon Sep 17 00:00:00 2001 From: wdecoster Date: Tue, 23 Jul 2024 13:45:48 +0200 Subject: [PATCH] with --debug, store time information regarding time taken to genotype repeat this requires repeat instances to be mutable this is done to be able to identify repeats that are slow to genotype may revert later --- Cargo.toml | 3 ++- src/call.rs | 12 ++++++------ src/consensus.rs | 1 + src/genotype.rs | 36 +++++++++++++++++++++++++++--------- src/parse_bam.rs | 2 ++ src/phase_insertions.rs | 6 ++++++ src/repeats.rs | 9 ++++++++- src/vcf.rs | 27 +++++++++++++++++++++++++-- 8 files changed, 77 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ca4de92..65097c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "STRdust" -version = "0.8.6" +version = "0.8.7" edition = "2021" @@ -28,6 +28,7 @@ petgraph = "0.6.4" hts-sys = "2.1.1" reqwest = { version = "0.12.5", features = ["blocking", "json"] } indicatif = { version = "0.17.1", features = ["rayon"] } +chrono = "0.4.38" [dev-dependencies] ctor = "*" diff --git a/src/call.rs b/src/call.rs index f8b948e..90a607b 100644 --- a/src/call.rs +++ b/src/call.rs @@ -21,8 +21,8 @@ pub fn genotype_repeats(args: Cli) { // The indexedreader is created once and passed on to the function let num_intervals = repeats.num_intervals; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); - for repeat in repeats.progress_count(num_intervals as u64) { - if let Ok(output) = genotype::genotype_repeat_singlethreaded(&repeat, &args, &mut bam) { + for mut repeat in repeats.progress_count(num_intervals as u64) { + if let Ok(output) = genotype::genotype_repeat_singlethreaded(&mut repeat, &args, &mut bam) { writeln!(handle, "{output}").expect("Failed writing the result."); } } @@ -39,8 +39,8 @@ pub fn genotype_repeats(args: Cli) { repeats .par_bridge() .progress_count(num_intervals as u64) - .for_each(|repeat| { - if let Ok(output) = genotype::genotype_repeat_multithreaded(&repeat, &args) { + .for_each(|mut repeat| { + if let Ok(output) = genotype::genotype_repeat_multithreaded(&mut repeat, &args) { let mut geno = genotypes.lock().expect("Unable to lock genotypes mutex"); geno.push(output); } else { @@ -66,8 +66,8 @@ pub fn genotype_repeats(args: Cli) { repeats .par_bridge() .progress_count(num_intervals as u64) - .for_each(|repeat| { - if let Ok(output) = genotype::genotype_repeat_multithreaded(&repeat, &args) { + .for_each(|mut repeat| { + if let Ok(output) = genotype::genotype_repeat_multithreaded(&mut repeat, &args) { println!("{output}"); } }); diff --git a/src/consensus.rs b/src/consensus.rs index 0dfd232..3bc75b6 100644 --- a/src/consensus.rs +++ b/src/consensus.rs @@ -197,6 +197,7 @@ mod tests { chrom: "chr1".to_string(), start: 1, end: 100, + created: None }, ); println!("Consensus: {}", cons.seq.unwrap()); diff --git a/src/genotype.rs b/src/genotype.rs index 5f3a3c4..f557653 100644 --- a/src/genotype.rs +++ b/src/genotype.rs @@ -7,19 +7,25 @@ use rust_htslib::bam; // when running multithreaded, the indexedreader has to be created every time again // this is probably expensive pub fn genotype_repeat_multithreaded( - repeat: &crate::repeats::RepeatInterval, + repeat: &mut crate::repeats::RepeatInterval, args: &Cli, ) -> Result { + if args.debug { + repeat.set_time_stamp() + } let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); genotype_repeat(repeat, args, &mut bam) } // when running singlethreaded, the indexedreader is created once and simply passed on pub fn genotype_repeat_singlethreaded( - repeat: &crate::repeats::RepeatInterval, + repeat: &mut crate::repeats::RepeatInterval, args: &Cli, bam: &mut bam::IndexedReader, ) -> Result { + if args.debug { + repeat.set_time_stamp() + } genotype_repeat(repeat, args, bam) } @@ -42,18 +48,19 @@ fn genotype_repeat( repeat, "N", ".".to_string(), + args )) } }; let repeat_compressed_reference = repeat.make_repeat_compressed_sequence(&args.fasta, flanking); - if args.debug { - // write the repeat compressed reference to a file - use std::fs; - let header = format!(">{chrom}:{start}-{end}\n", chrom = repeat.chrom, start = repeat.start, end = repeat.end); - let fas = String::from_utf8(repeat_compressed_reference.clone()).expect("Unable to convert repeat compressed reference to string"); - fs::write("repeat_compressed.fa", header + &fas).expect("Unable to write repeat compressed reference to file"); - } + // if args.debug { + // // write the repeat compressed reference to a file + // use std::fs; + // let header = format!(">{chrom}:{start}-{end}\n", chrom = repeat.chrom, start = repeat.start, end = repeat.end); + // let fas = String::from_utf8(repeat_compressed_reference.clone()).expect("Unable to convert repeat compressed reference to string"); + // fs::write("repeat_compressed.fa", header + &fas).expect("Unable to write repeat compressed reference to file"); + // } // alignments can be extracted in an unphased manner, if the chromosome is --haploid or the --unphased is set // this means that --haploid overrides the phases which could be present in the bam file @@ -69,6 +76,7 @@ fn genotype_repeat( repeat, &repeat_ref_seq, 0.to_string(), + args )); } }; @@ -119,6 +127,7 @@ fn genotype_repeat( repeat, &repeat_ref_seq, insertions.len().to_string(), + args )); } // there is only one haplotype, haploid, so this gets duplicated for reporting in the VCF module @@ -144,6 +153,7 @@ fn genotype_repeat( repeat, &repeat_ref_seq, insertions.len().to_string(), + args )); } // handle the special case where there is only 1 read for the repeat (but minimal support is 1 so it passes) @@ -156,6 +166,7 @@ fn genotype_repeat( all_insertions, reads.ps, flags, + args )); } @@ -231,6 +242,7 @@ fn genotype_repeat( repeat, reads.ps, flags, + args )) } @@ -344,6 +356,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let flanking = 2000; let minlen = 5; @@ -383,6 +396,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -412,6 +426,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let args = Cli { bam: String::from("test_data/small-test-phased.bam"), @@ -458,6 +473,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -487,6 +503,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -517,6 +534,7 @@ mod tests { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); diff --git a/src/parse_bam.rs b/src/parse_bam.rs index 719642c..664d238 100644 --- a/src/parse_bam.rs +++ b/src/parse_bam.rs @@ -124,6 +124,7 @@ fn test_get_overlapping_reads() { chrom: String::from("chr7"), start: 154654404, end: 154654432, + created: None, }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); @@ -138,6 +139,7 @@ fn test_get_overlapping_reads_url() { chrom: String::from("chr20"), start: 154654404, end: 154654432, + created: None }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); diff --git a/src/phase_insertions.rs b/src/phase_insertions.rs index 32822c2..b0da167 100644 --- a/src/phase_insertions.rs +++ b/src/phase_insertions.rs @@ -332,6 +332,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); @@ -372,6 +373,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); @@ -411,6 +413,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); @@ -453,6 +456,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); @@ -493,6 +497,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); @@ -564,6 +569,7 @@ mod tests { chrom: "chr7".to_string(), start: 154654404, end: 154654432, + created: None }, false, ); diff --git a/src/repeats.rs b/src/repeats.rs index 08c3c93..c7b488d 100644 --- a/src/repeats.rs +++ b/src/repeats.rs @@ -80,6 +80,7 @@ impl Clone for RepeatInterval { chrom: self.chrom.clone(), start: self.start, end: self.end, + created: self.created, } } } @@ -105,6 +106,7 @@ pub struct RepeatInterval { pub chrom: String, pub start: u32, pub end: u32, + pub created: Option>, } impl fmt::Display for RepeatInterval { @@ -144,7 +146,7 @@ impl RepeatInterval { .expect("Failed parsing chromosome length from fai file") > end { - return Some(Self { chrom, start, end }); + return Some(Self { chrom, start, end, created: None }); } } // if the chromosome is not in the fai file or the end does not fit the interval, return None @@ -157,6 +159,7 @@ impl RepeatInterval { chrom: chrom.to_string(), start, end, + created: None } } @@ -201,6 +204,10 @@ impl RepeatInterval { } Some(repeat_ref_sequence) } + + pub fn set_time_stamp(& mut self) { + self.created = Some(chrono::Utc::now()); + } } #[cfg(test)] diff --git a/src/vcf.rs b/src/vcf.rs index 7e233dc..1c68865 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -52,6 +52,7 @@ pub struct VCFRecord { pub score: (String, String), pub somatic_info_field: String, pub outliers: String, + pub time_taken: String, pub ps: Option, // phase set identifier pub flags: String, pub allele: (String, String), @@ -66,6 +67,7 @@ impl VCFRecord { repeat: &crate::repeats::RepeatInterval, ps: Option, flag: Vec, + args: &crate::Cli, ) -> VCFRecord { // since I use .pop() to format the two consensus sequences, the order is reversed let allele2 = Allele::from_consensus( @@ -142,6 +144,11 @@ impl VCFRecord { } else { format!("{};", flag.join(";")) }; + let time_taken = if args.debug { + format!(";TIME={}", chrono::Utc::now() - repeat.created.expect("Failed accessing timestamp") ) + } else { + "".to_string() + }; VCFRecord { chrom: repeat.chrom.clone(), start: repeat.start, @@ -155,6 +162,7 @@ impl VCFRecord { score: (allele1.score, allele2.score), somatic_info_field, outliers, + time_taken, ps, flags, allele: (genotype1.to_string(), genotype2.to_string()), @@ -165,7 +173,13 @@ impl VCFRecord { repeat: &crate::repeats::RepeatInterval, repeat_ref_seq: &str, support: String, + args: &crate::Cli, ) -> VCFRecord { + let time_taken = if args.debug { + format!(";TIME={}", chrono::Utc::now() - repeat.created.expect("Failed accessing timestamp") ) + } else { + "".to_string() + }; VCFRecord { chrom: repeat.chrom.clone(), start: repeat.start, @@ -179,6 +193,7 @@ impl VCFRecord { score: (".".to_string(), ".".to_string()), somatic_info_field: "".to_string(), outliers: "".to_string(), + time_taken, ps: None, flags: "".to_string(), allele: (".".to_string(), ".".to_string()), @@ -191,7 +206,8 @@ impl VCFRecord { repeat_ref_seq: &str, all_insertions: Option>, ps: Option, - flag: Vec, + flag: Vec, + args: &crate::Cli, ) -> VCFRecord { let somatic_info_field = match all_insertions { Some(somatic_insertions) => { @@ -199,6 +215,11 @@ impl VCFRecord { } None => "".to_string(), }; + let time_taken = if args.debug { + format!(";TIME={}", chrono::Utc::now() - repeat.created.expect("Failed accessing timestamp") ) + } else { + "".to_string() + }; VCFRecord { chrom: repeat.chrom.clone(), start: repeat.start, @@ -212,6 +233,7 @@ impl VCFRecord { score: (".".to_string(), ".".to_string()), somatic_info_field, outliers: "".to_string(), + time_taken, ps, flags : if flag.is_empty() { "".to_string() @@ -234,7 +256,7 @@ impl fmt::Display for VCFRecord { }; write!( f, - "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}END={end};STDEV={sd1},{sd2}{somatic}{outliers}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{score1},{score2}{ps}", + "{chrom}\t{start}\t.\t{ref}\t{alt}\t.\t.\t{flags}END={end};STDEV={sd1},{sd2}{somatic}{outliers}{time_taken}\t{FORMAT}\t{genotype1}|{genotype2}:{l1},{l2}:{fl1},{fl2}:{sup1},{sup2}:{score1},{score2}{ps}", chrom = self.chrom, start = self.start, flags = self.flags, @@ -249,6 +271,7 @@ impl fmt::Display for VCFRecord { sd2 = self.std_dev.1, somatic = self.somatic_info_field, outliers = self.outliers, + time_taken = self.time_taken, genotype1 = self.allele.0, genotype2 = self.allele.1, sup1 = self.support.0,