From 0e393331c46681768192db53c9a76395dd53ffa0 Mon Sep 17 00:00:00 2001 From: acesnik Date: Fri, 3 May 2024 14:27:46 -0700 Subject: [PATCH 1/2] starting on recording start and end positions in output matrix --- crates/sage-cli/src/output.rs | 10 +++++++--- crates/sage/src/enzyme.rs | 29 +++++++++++++++++++++++------ crates/sage/src/peptide.rs | 9 +++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/crates/sage-cli/src/output.rs b/crates/sage-cli/src/output.rs index 8fc5a638..698192c8 100644 --- a/crates/sage-cli/src/output.rs +++ b/crates/sage-cli/src/output.rs @@ -30,6 +30,8 @@ impl Runner { .format(peptide.proteins.len()) .as_bytes(), ); + record.push_field(itoa::Buffer::new().format(peptide.start_position).as_bytes()); + record.push_field(itoa::Buffer::new().format(peptide.end_position).as_bytes()); record.push_field(filenames[feature.file_id].as_bytes()); record.push_field(feature.spec_id.as_bytes()); record.push_field(itoa::Buffer::new().format(feature.rank).as_bytes()); @@ -161,12 +163,14 @@ impl Runner { "peptide", "proteins", "num_proteins", + "start_positions", + "end_positions", "filename", - "scannr", + "scan", "rank", "label", - "expmass", - "calcmass", + "measured_mass", + "calculated_mass", "charge", "peptide_len", "missed_cleavages", diff --git a/crates/sage/src/enzyme.rs b/crates/sage/src/enzyme.rs index 88c7ab5f..ba51b54a 100644 --- a/crates/sage/src/enzyme.rs +++ b/crates/sage/src/enzyme.rs @@ -23,6 +23,10 @@ pub struct Digest { pub missed_cleavages: u8, /// Is this an N-terminal peptide of the protein? pub position: Position, + /// What residue position does this start at (1-based inclusive)? + pub start_position: usize, + /// What residue position does this end at (1-based inclusive)? + pub end_position: usize } #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] @@ -58,6 +62,8 @@ impl Digest { sequence: sequence.into_iter().collect(), missed_cleavages: self.missed_cleavages, position: self.position, + start_position: self.start_position, + end_position: self.end_position } } } @@ -76,6 +82,7 @@ impl std::hash::Hash for Digest { fn hash(&self, state: &mut H) { self.sequence.hash(state); self.position.hash(state); + self.start_position.hash(state); } } @@ -306,6 +313,8 @@ impl EnzymeParameters { semi_enzymatic: site.semi_enzymatic, position, protein: protein.clone(), + start_position: start + 1, + end_position: end }); } } @@ -330,6 +339,8 @@ mod test { missed_cleavages: 0, position: Position::Nterm, protein: Arc::new(String::default()), + start_position: 1, + end_position: 6 }, Digest { decoy: false, @@ -338,6 +349,8 @@ mod test { missed_cleavages: 0, position: Position::Nterm, protein: Arc::new(String::default()), + start_position: 1, + end_position: 6 }, ]; @@ -353,6 +366,8 @@ mod test { missed_cleavages: 0, position: Position::Nterm, protein: Arc::new(String::default()), + start_position: 1, + end_position: 6 }, Digest { decoy: false, @@ -361,6 +376,8 @@ mod test { missed_cleavages: 0, position: Position::Internal, protein: Arc::new(String::default()), + start_position: 7, + end_position: 12 }, ]; @@ -373,11 +390,11 @@ mod test { fn trypsin() { let sequence = "MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGN"; let expected = vec![ - ("MADEEK".into(), Position::Nterm), - ("LPPGWEK".into(), Position::Internal), - ("MSR".into(), Position::Internal), - ("SSGR".into(), Position::Internal), - ("VYYFNHITNASQWERPSGN".into(), Position::Cterm), + ("MADEEK".into(), Position::Nterm, 1, 6), + ("LPPGWEK".into(), Position::Internal, 7, 13), + ("MSR".into(), Position::Internal, 14, 16), + ("SSGR".into(), Position::Internal, 17, 20), + ("VYYFNHITNASQWERPSGN".into(), Position::Cterm, 21, 40), ]; let tryp = EnzymeParameters { @@ -391,7 +408,7 @@ mod test { expected, tryp.digest(sequence, Arc::default()) .into_iter() - .map(|d| (d.sequence, d.position)) + .map(|d| (d.sequence, d.position, d.start_position, d.end_position)) .collect::>() ); } diff --git a/crates/sage/src/peptide.rs b/crates/sage/src/peptide.rs index 7c731fa6..c6004957 100644 --- a/crates/sage/src/peptide.rs +++ b/crates/sage/src/peptide.rs @@ -28,6 +28,10 @@ pub struct Peptide { pub position: Position, pub proteins: Vec>, + /// What residue does this peptide start at in the protein (1-based inclusive)? + pub start_position: Vec>, + /// What residue does this peptide end at in the protein (1-based inclusive)? + pub end_position: Vec>, } impl Peptide { @@ -66,6 +70,8 @@ impl Debug for Peptide { .field("monoisotopic", &self.monoisotopic) .field("missed_cleavages", &self.missed_cleavages) .field("position", &self.position) + .field("start_position", &self.start_position) + .field("end_position", &self.end_position) .finish() } } @@ -313,6 +319,7 @@ impl Peptide { s[1..n].reverse(); pep.sequence = Arc::from(s.into_boxed_slice()); pep.modifications[1..n].reverse(); + pep.start_position = pep.prot } pep } @@ -373,6 +380,8 @@ impl TryFrom for Peptide { missed_cleavages: value.missed_cleavages, semi_enzymatic: value.semi_enzymatic, proteins: vec![value.protein], + start_position: value.start_position, + end_position: value.end_position, }) } } From 745e5aff21e703ef18b37bb47a75d89df884f749 Mon Sep 17 00:00:00 2001 From: acesnik Date: Tue, 7 May 2024 14:50:46 -0700 Subject: [PATCH 2/2] incorporating feedback, still in progress --- crates/sage-cli/src/output.rs | 4 ++-- crates/sage-cli/tests/integration.rs | 2 -- crates/sage/src/enzyme.rs | 18 ++++++++++++------ crates/sage/src/peptide.rs | 11 ++++++----- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/crates/sage-cli/src/output.rs b/crates/sage-cli/src/output.rs index 698192c8..cc95e2b7 100644 --- a/crates/sage-cli/src/output.rs +++ b/crates/sage-cli/src/output.rs @@ -30,8 +30,8 @@ impl Runner { .format(peptide.proteins.len()) .as_bytes(), ); - record.push_field(itoa::Buffer::new().format(peptide.start_position).as_bytes()); - record.push_field(itoa::Buffer::new().format(peptide.end_position).as_bytes()); + record.push_field(peptide.start_position.iter().map(|&x| x.to_string()).collect::>().join(";").as_bytes()); + record.push_field(peptide.end_position.iter().map(|&x| x.to_string()).collect::>().join(";").as_bytes()); record.push_field(filenames[feature.file_id].as_bytes()); record.push_field(feature.spec_id.as_bytes()); record.push_field(itoa::Buffer::new().format(feature.rank).as_bytes()); diff --git a/crates/sage-cli/tests/integration.rs b/crates/sage-cli/tests/integration.rs index 872d4f07..28879de1 100644 --- a/crates/sage-cli/tests/integration.rs +++ b/crates/sage-cli/tests/integration.rs @@ -1,7 +1,5 @@ use sage_core::database::Builder; -use sage_core::enzyme::Digest; use sage_core::mass::Tolerance; -use sage_core::peptide::Peptide; use sage_core::scoring::Scorer; use sage_core::spectrum::SpectrumProcessor; diff --git a/crates/sage/src/enzyme.rs b/crates/sage/src/enzyme.rs index ba51b54a..bbb945e0 100644 --- a/crates/sage/src/enzyme.rs +++ b/crates/sage/src/enzyme.rs @@ -18,15 +18,22 @@ pub struct Digest { /// Cleaved peptide sequence pub sequence: String, /// Protein accession - pub protein: Arc, + pub protein: ProteinAssignment, /// Missed cleavages pub missed_cleavages: u8, /// Is this an N-terminal peptide of the protein? pub position: Position, /// What residue position does this start at (1-based inclusive)? - pub start_position: usize, + pub start_position: u32, /// What residue position does this end at (1-based inclusive)? - pub end_position: usize + pub end_position: u32 +} + +#[derive(Clone, PartialOrd, Ord, Debug, Default)] +pub struct ProteinAssignment { + identifier: Arc, + start_position: u32, + end_position: u32 } #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] @@ -82,7 +89,6 @@ impl std::hash::Hash for Digest { fn hash(&self, state: &mut H) { self.sequence.hash(state); self.position.hash(state); - self.start_position.hash(state); } } @@ -313,8 +319,8 @@ impl EnzymeParameters { semi_enzymatic: site.semi_enzymatic, position, protein: protein.clone(), - start_position: start + 1, - end_position: end + start_position: (start + 1) as u32, + end_position: end as u32 }); } } diff --git a/crates/sage/src/peptide.rs b/crates/sage/src/peptide.rs index c6004957..613974ff 100644 --- a/crates/sage/src/peptide.rs +++ b/crates/sage/src/peptide.rs @@ -29,9 +29,9 @@ pub struct Peptide { pub proteins: Vec>, /// What residue does this peptide start at in the protein (1-based inclusive)? - pub start_position: Vec>, + pub start_position: Vec, /// What residue does this peptide end at in the protein (1-based inclusive)? - pub end_position: Vec>, + pub end_position: Vec, } impl Peptide { @@ -319,7 +319,8 @@ impl Peptide { s[1..n].reverse(); pep.sequence = Arc::from(s.into_boxed_slice()); pep.modifications[1..n].reverse(); - pep.start_position = pep.prot + pep.start_position = pep.start_position; // TODO: calculate start/end in reversed protein sequences? + pep.end_position = pep.end_position; } pep } @@ -380,8 +381,8 @@ impl TryFrom for Peptide { missed_cleavages: value.missed_cleavages, semi_enzymatic: value.semi_enzymatic, proteins: vec![value.protein], - start_position: value.start_position, - end_position: value.end_position, + start_position: vec![value.start_position], + end_position: vec![value.end_position], }) } }