From 0729cc0a93879c8281d2a8437b421eb6782152eb Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Mon, 21 Oct 2024 09:13:04 -0700 Subject: [PATCH] (feat) initial centroid implementation for tdf --- crates/sage-cloudpath/src/tdf.rs | 156 ++++++++++++++++++++++++++++++- crates/sage/src/fdr.rs | 2 +- crates/sage/src/modification.rs | 2 +- 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/crates/sage-cloudpath/src/tdf.rs b/crates/sage-cloudpath/src/tdf.rs index 01d38e5..8d12266 100644 --- a/crates/sage-cloudpath/src/tdf.rs +++ b/crates/sage-cloudpath/src/tdf.rs @@ -3,10 +3,91 @@ use sage_core::{ mass::Tolerance, spectrum::{Precursor, RawSpectrum, Representation}, }; +use timsrust::converters::ConvertableDomain; pub use timsrust::readers::SpectrumReaderConfig as BrukerSpectrumProcessor; pub struct TdfReader; +use std::cmp::Ordering; + +fn squash_frame(mz_array: &[f32], intensity_array: &[f32], tol_ppm: f32) -> (Vec, Vec) { + // Make sure the mz array is sorted + assert!(mz_array.windows(2).all(|x| x[0] <= x[1])); + + let arr_len = mz_array.len(); + let mut touched = vec![false; arr_len]; + let mut global_num_touched = 0; + + let mut order: Vec = (0..arr_len).collect(); + order.sort_unstable_by(|&a, &b| { + intensity_array[b] + .partial_cmp(&intensity_array[a]) + .unwrap_or(Ordering::Equal) + }); + + let mut agg_mz = vec![0.0; arr_len]; + let mut agg_intensity = vec![0.0; arr_len]; + + let utol = tol_ppm / 1e6; + + for &idx in &order { + if touched[idx] { + continue; + } + + let mz = mz_array[idx]; + let da_tol = mz * utol; + let left_e = mz - da_tol; + let right_e = mz + da_tol; + + let ss_start = mz_array.partition_point(|&x| x < left_e); + let ss_end = mz_array.partition_point(|&x| x <= right_e); + + let slice_width = ss_end - ss_start; + let local_num_touched = touched[ss_start..ss_end].iter().filter(|&&x| x).count(); + let local_num_untouched = slice_width - local_num_touched; + + if local_num_touched == slice_width { + continue; + } + + let mut curr_intensity = 0.0; + let mut curr_weighted_mz = 0.0; + + for i in ss_start..ss_end { + if !touched[i] && intensity_array[i] > 0.0 { + curr_intensity += intensity_array[i]; + curr_weighted_mz += mz_array[i] * intensity_array[i]; + } + } + + if curr_intensity > 0.0 { + curr_weighted_mz /= curr_intensity; + + agg_intensity[idx] = curr_intensity; + agg_mz[idx] = curr_weighted_mz; + + touched[ss_start..ss_end].iter_mut().for_each(|x| *x = true); + global_num_touched += local_num_untouched; + } + + if global_num_touched == arr_len { + break; + } + } + + // Drop the zeros and sort + let mut result: Vec<(f32, f32)> = agg_mz + .into_iter() + .zip(agg_intensity.into_iter()) + .filter(|&(mz, intensity)| mz > 0.0 && intensity > 0.0) + .collect(); + + result.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal)); + + result.into_iter().unzip() +} + impl TdfReader { pub fn parse( &self, @@ -14,10 +95,16 @@ impl TdfReader { file_id: usize, bruker_spectrum_processor: BrukerSpectrumProcessor, ) -> Result, timsrust::TimsRustError> { + let tdf_path = std::path::Path::new(path_name.as_ref()).join("analysis.tdf"); let spectrum_reader = timsrust::readers::SpectrumReader::build() .with_path(path_name.as_ref()) .with_config(bruker_spectrum_processor) .finalize()?; + let frame_reader = timsrust::readers::FrameReader::new(path_name.as_ref())?; + let metadata = timsrust::readers::MetadataReader::new(tdf_path)?; + let mz_converter = metadata.mz_converter; + + // Get ms2 spectra let spectra: Vec = (0..spectrum_reader.len()) .into_par_iter() .filter_map(|index| match spectrum_reader.get(index) { @@ -33,6 +120,9 @@ impl TdfReader { precursors: vec![precursor], representation: Representation::Centroid, scan_start_time: dda_precursor.rt as f32 / 60.0, + // I am pretty sure this is wrong ... + // The retention time is most certainly not what we want for the + // injection time. ion_injection_time: dda_precursor.rt as f32, total_ion_current: 0.0, mz: dda_spectrum.mz_values.iter().map(|&x| x as f32).collect(), @@ -42,11 +132,73 @@ impl TdfReader { }; Some(spectrum) } - None => None, + None => { + log::warn!("No precursor found for spectrum {:?}", dda_spectrum.index); + None + } }, - Err(_) => None, + // Q: should we raise/propagate/log an error here? + Err(x) => { + log::error!("error parsing spectrum: {:?}", x); + None + } }) .collect(); + + // Get MS1 spectra + let ms1_spectra = frame_reader + .get_all_ms1() + .into_iter() + .filter_map(|frame| match frame { + Ok(frame) => { + let mz: Vec = frame + .tof_indices + .iter() + .map(|&x| mz_converter.convert(x as f64) as f32) + .collect(); + let intensity: Vec = frame.intensities.iter().map(|&x| x as f32).collect(); + + // Sort the mzs and intensities by mz + let mut indices: Vec = (0..mz.len()).collect(); + indices.sort_by(|&i, &j| { + mz[i] + .partial_cmp(&mz[j]) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let sorted_mz: Vec = indices.iter().map(|&i| mz[i].clone()).collect(); + let sorted_inten: Vec = + indices.iter().map(|&i| intensity[i].clone()).collect(); + + // Squash the mobility dimension + let (mz, intensity) = squash_frame(&sorted_mz, &sorted_inten, 20.0); + let scan_start_time = frame.rt as f32 / 60.0; + let ion_injection_time = 100.0; + let total_ion_current = 0.0; + let id = frame.index.to_string(); + + let spec = RawSpectrum { + file_id, + precursors: vec![], + representation: Representation::Centroid, + scan_start_time, + ion_injection_time, + mz, + ms_level: 1, + id, + intensity, + total_ion_current, + }; + Some(spec) + } + Err(x) => { + log::error!("error parsing spectrum: {:?}", x); + None + } + }); + + // Merge the two + let spectra: Vec = ms1_spectra.chain(spectra.into_iter()).collect(); + Ok(spectra) } diff --git a/crates/sage/src/fdr.rs b/crates/sage/src/fdr.rs index 75658a3..554ee7b 100644 --- a/crates/sage/src/fdr.rs +++ b/crates/sage/src/fdr.rs @@ -235,7 +235,7 @@ pub fn picked_precursor( .map(|score| ((score.ix, score.decoy), score.q)) .collect::>(); - peaks.par_iter_mut().for_each(|((ix), (peak, _))| { + peaks.par_iter_mut().for_each(|(ix, (peak, _))| { peak.q_value = scores[ix]; }); passing diff --git a/crates/sage/src/modification.rs b/crates/sage/src/modification.rs index 5d025ea..afdef9a 100644 --- a/crates/sage/src/modification.rs +++ b/crates/sage/src/modification.rs @@ -4,7 +4,7 @@ use std::{ str::FromStr, }; -use serde::{de::Visitor, Deserialize, Serialize}; +use serde::Serialize; use crate::mass::VALID_AA;