diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 2e5d29718..20d0e7abe 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -18,9 +18,9 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS var subsequence = new StringBuilder(); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subsequence.Append('[' + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -32,11 +32,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } } @@ -46,11 +46,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } return subsequence.ToString(); @@ -68,14 +68,15 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, string essentialSequence = withSetMods.BaseSequence; if (modstoWritePruned != null) { - var sbsequence = new StringBuilder(); + var sbsequence = new StringBuilder(withSetMods.FullSequence.Length); // variable modification on peptide N-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification pep_n_term_variable_mod)) { if (modstoWritePruned.ContainsKey(pep_n_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_n_term_variable_mod.ModificationType + ":" + pep_n_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_n_term_variable_mod.ModificationType}:{pep_n_term_variable_mod.IdWithMotif}]"); } } for (int r = 0; r < withSetMods.Length; r++) @@ -86,7 +87,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(residue_variable_mod.ModificationType)) { - sbsequence.Append('[' + residue_variable_mod.ModificationType + ":" + residue_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{residue_variable_mod.ModificationType}:{residue_variable_mod.IdWithMotif}]"); } } } @@ -96,7 +98,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(pep_c_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_c_term_variable_mod.ModificationType + ":" + pep_c_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_c_term_variable_mod.ModificationType}:{pep_c_term_variable_mod.IdWithMotif}]"); } } @@ -112,12 +115,13 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, /// public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMods) { - var subSequence = new StringBuilder(); + // start string builder with initial capacity to avoid resizing costs. + var subSequence = new StringBuilder(withSetMods.BaseSequence.Length + withSetMods.AllModsOneIsNterminus.Count * 30); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -127,14 +131,14 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo // modification on this residue if (withSetMods.AllModsOneIsNterminus.TryGetValue(r + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } } // modification on peptide C-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } return subSequence.ToString(); diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 55aed3255..13fe51610 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,15 +1,13 @@ -using System; -using System.Collections.Generic; -using System.ComponentModel; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using MzLibUtil; using Omics.Modifications; namespace Omics.Digestion { public abstract class DigestionProduct { + protected static readonly DictionaryPool> DictionaryPool = new(); + protected static readonly DictionaryPool FixedModDictionaryPool = new(8); + protected string _baseSequence; protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int oneBasedEndResidue, int missedCleavages, @@ -41,33 +39,65 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one public int Length => BaseSequence.Length; //how many residues long the peptide is public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + #region Digestion Helper Methods + + /// + /// Generates all possible variable modification patterns for a peptide, which includes variable and localized modifications but excludes fixed mods + /// + /// A dictionary of possible variable modifications with their positions. + /// The maximum number of modifications allowed for the peptide. + /// The length of the peptide. + /// An enumerable of dictionaries representing different modification patterns. + /// + /// This method generates all possible combinations of variable modifications for a given peptide. + /// It first calculates the total number of available modifications and the maximum number of variable modifications allowed. + /// Then, it iterates through all possible numbers of modifications and generates the corresponding modification patterns. + /// The returned dictionary is then appended with fixed modifications and used to construct a peptide with set mods + /// protected static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) { - if (possibleVariableModifications.Count == 0) - { - yield return null; - } - else - { - var possible_variable_modifications = new Dictionary>(possibleVariableModifications); + if (possibleVariableModifications.Count <= 0) + yield break; + + int[] baseVariableModificationPattern = new int[peptideLength + 4]; + int totalAvailableMods = possibleVariableModifications.Values.Sum(modList => modList?.Count ?? 0); + int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); - int[] base_variable_modification_pattern = new int[peptideLength + 4]; - var totalAvailableMods = possible_variable_modifications.Sum(b => b.Value == null ? 0 : b.Value.Count); - for (int variable_modifications = 0; variable_modifications <= Math.Min(totalAvailableMods, maxModsForPeptide); variable_modifications++) + for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) + { + foreach (int[] variable_modification_pattern in GetVariableModificationPatternsRecursive(possibleVariableModifications.ToList(), + possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(new List>>(possible_variable_modifications), - possible_variable_modifications.Count - variable_modifications, base_variable_modification_pattern, 0)) + // use modification pattern to construct a dictionary of modifications for the peptide + var modificationPattern = new Dictionary(possibleVariableModifications.Count); + + foreach (KeyValuePair> kvp in possibleVariableModifications) { - yield return GetNewVariableModificationPattern(variable_modification_pattern, possible_variable_modifications); + int modIndex = variable_modification_pattern[kvp.Key] - 1; + if (modIndex >= 0) + { + modificationPattern.Add(kvp.Key, kvp.Value[modIndex]); + } } + + yield return modificationPattern; } } } - protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(int length, - IEnumerable allKnownFixedModifications) + /// + /// Sets the fixed modifications for the peptide, considering the N-terminal and C-terminal positions, by populating the dictionary. + /// + /// The length of the peptide. + /// A collection of all known fixed modifications. + /// A reference to a dictionary that will hold the fixed modifications, with the key representing the position. + /// + /// This method iterates through all known fixed modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. + /// + protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, + IEnumerable allKnownFixedModifications, in Dictionary fixedModsOneIsNterminus) { - var fixedModsOneIsNterminus = new Dictionary(length + 3); foreach (Modification mod in allKnownFixedModifications) { switch (mod.LocationRestriction) @@ -76,18 +106,16 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "Oligo 5'-terminal.": case "N-terminal.": case "Peptide N-terminal.": - //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) + //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginning of the protein + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { - if (OneBasedStartResidue != 1) + if (mod.ModificationType == "Protease") { - fixedModsOneIsNterminus[2] = mod; + if (OneBasedStartResidue != 1) + fixedModsOneIsNterminus[2] = mod; } - } - //Normal N-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) - { - fixedModsOneIsNterminus[1] = mod; + else //Normal N-terminal peptide modification + fixedModsOneIsNterminus[1] = mod; } break; @@ -106,17 +134,15 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "C-terminal.": case "Peptide C-terminal.": //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) { - if (OneBasedEndResidue != Parent.Length) + if (mod.ModificationType == "Protease") { - fixedModsOneIsNterminus[length + 1] = mod; + if (OneBasedEndResidue != Parent.Length) + fixedModsOneIsNterminus[length + 1] = mod; } - } - //Normal C-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) - { - fixedModsOneIsNterminus[length + 2] = mod; + else //Normal C-terminal peptide modification + fixedModsOneIsNterminus[length + 2] = mod; } break; @@ -124,11 +150,144 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in throw new NotSupportedException("This terminus localization is not supported."); } } - return fixedModsOneIsNterminus; } + /// + /// Populates the variable modifications dictionary from both the variable modifications and the localized mods from xml reading, + /// considering the N-terminal, C-terminal, and internal positions. + /// + /// A list of all variable modifications. + /// A reference to a dictionary that will hold the variable modifications, with the key representing the position. + /// + /// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. + /// + protected void PopulateVariableModifications(List allVariableMods, in Dictionary> twoBasedDictToPopulate) + { + int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; + var pepNTermVariableMods = new List(); + twoBasedDictToPopulate.Add(1, pepNTermVariableMods); + + var pepCTermVariableMods = new List(); + twoBasedDictToPopulate.Add(peptideLength + 2, pepCTermVariableMods); + + // VARIABLE MODS + foreach (Modification variableModification in allVariableMods) + { + // Check if can be a n-term mod + if (CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, 1, variableModification)) + { + pepNTermVariableMods.Add(variableModification); + } + + for (int r = 0; r < peptideLength; r++) + { + if (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere." && !ModificationLocalization.UniprotModExists(Parent, r + 1, variableModification)) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new List() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a c-term mod + if (CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, peptideLength, variableModification)) + { + pepCTermVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in Parent.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is not Modification variableModification) + continue; + + // Check if can be a n-term mod + if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + pepNTermVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < peptideLength + && (Parent.IsDecoy || + (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new List() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a c-term mod + if (locInPeptide == peptideLength && CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + pepCTermVariableMods.Add(variableModification); + } + } + } + } + + /// + /// Appends fixed modifications to the variable modification pattern when no variable mod exists. + /// + /// The dictionary containing fixed modifications. + /// The dictionary containing the variable modification pattern. + /// The number of fixed modifications appended. + /// + /// This method iterates through the fixed modifications and adds them to the variable modification pattern + /// if they are not already present. The number of fixed modifications appended is returned via the out parameter. + /// + protected void AppendFixedModificationsToVariable(in Dictionary fixedModDict, in Dictionary variableModPattern, out int numFixedMods) + { + numFixedMods = 0; + foreach (var fixedModPattern in fixedModDict) + { + if (variableModPattern.ContainsKey(fixedModPattern.Key)) + continue; + + numFixedMods++; + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + } + } - private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, + /// + /// Recursively generates all possible variable modification patterns for a peptide. + /// + /// A list of key-value pairs representing possible variable modifications and their positions. + /// The number of unmodified residues desired in the pattern. + /// An array representing the current modification pattern. + /// The current index in the list of possible modifications. + /// An enumerable of arrays representing different modification patterns. The array index corresponds to the location of the modification + /// in the peptide, while the value at that index determines which index in the list of modifications + /// to add to the final variable modification pattern + /// + /// This method uses recursion to generate all possible combinations of variable modifications for a given peptide. + /// It considers both modified and unmodified residues and generates patterns accordingly. + /// + private static IEnumerable GetVariableModificationPatternsRecursive(List>> possibleVariableModifications, int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) { if (index < possibleVariableModifications.Count - 1) @@ -136,7 +295,7 @@ private static IEnumerable GetVariableModificationPatterns(List 0) { variableModificationPattern[possibleVariableModifications[index].Key] = 0; - foreach (int[] new_variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications, + foreach (int[] new_variable_modification_pattern in GetVariableModificationPatternsRecursive(possibleVariableModifications, unmodifiedResiduesDesired - 1, variableModificationPattern, index + 1)) { yield return new_variable_modification_pattern; @@ -147,7 +306,7 @@ private static IEnumerable GetVariableModificationPatterns(List GetVariableModificationPatterns(List GetNewVariableModificationPattern(int[] variableModificationArray, - IEnumerable>> possibleVariableModifications) + /// + /// Determines if a modification can be applied to the N-terminal or 5' end of the peptide. + /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the N-terminal or 5' end; otherwise, false. + private bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) { - var modification_pattern = new Dictionary(); - - foreach (KeyValuePair> kvp in possibleVariableModifications) - { - if (variableModificationArray[kvp.Key] > 0) - { - modification_pattern.Add(kvp.Key, kvp.Value[variableModificationArray[kvp.Key] - 1]); - } - } - - return modification_pattern; + return mod.LocationRestriction is "5'-terminal." or "Oligo 5'-terminal." or "N-terminal." or "Peptide N-terminal." + && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, peptideLength, OneBasedStartResidue); } + /// + /// Determines if a modification can be applied to the C-terminal or 3' end of the peptide. + /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the C-terminal or 3' end; otherwise, false. + private bool CanBeCTerminalOrThreePrime(Modification mod, int peptideLength) + { + return mod.LocationRestriction is "3'-terminal." or "Oligo 3'-terminal." or "C-terminal." or "Peptide C-terminal." + && ModificationLocalization.ModFits(mod, Parent.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); + } + #endregion } } diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index bbf25d1a3..e2c57fa2d 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -2,19 +2,28 @@ { public static class ModificationLocalization { + // This method is called a ton (8.8 billion times in Bottom-Up Jenkins as of 1.0.6) in MetaMorpheus. If changes are made, ensure they are efficient. public static bool ModFits(Modification attemptToLocalize, string sequence, int digestionProductOneBasedIndex, int digestionProductLength, int bioPolymerOneBasedIndex) { // First find the capital letter... - var motif = attemptToLocalize.Target; - var motifStartLocation = motif.ToString().IndexOf(motif.ToString().First(b => char.IsUpper(b))); + var motif = attemptToLocalize.Target.ToString(); + var motifStartLocation = -1; + for (int i = 0; i < motif.Length; i++) + { + if (!char.IsUpper(motif[i])) + continue; + + motifStartLocation = i; + break; + } // Look up starting at and including the capital letter var proteinToMotifOffset = bioPolymerOneBasedIndex - motifStartLocation - 1; var indexUp = 0; - while (indexUp < motif.ToString().Length) + while (indexUp < motif.Length) { if (indexUp + proteinToMotifOffset < 0 || indexUp + proteinToMotifOffset >= sequence.Length - || !MotifMatches(motif.ToString()[indexUp], sequence[indexUp + proteinToMotifOffset])) + || !MotifMatches(motif[indexUp], sequence[indexUp + proteinToMotifOffset])) { return false; } @@ -56,11 +65,14 @@ public static bool UniprotModExists(IBioPolymer bioPolymer, int i, Modification private static bool MotifMatches(char motifChar, char sequenceChar) { char upperMotifChar = char.ToUpper(motifChar); - return upperMotifChar.Equals('X') - || upperMotifChar.Equals(sequenceChar) - || upperMotifChar.Equals('B') && new[] { 'D', 'N' }.Contains(sequenceChar) - || upperMotifChar.Equals('J') && new[] { 'I', 'L' }.Contains(sequenceChar) - || upperMotifChar.Equals('Z') && new[] { 'E', 'Q' }.Contains(sequenceChar); + return upperMotifChar switch + { + 'X' => true, + 'B' => sequenceChar is 'D' or 'N', + 'J' => sequenceChar is 'I' or 'L', + 'Z' => sequenceChar is 'E' or 'Q', + _ => upperMotifChar == sequenceChar + }; } } } \ No newline at end of file diff --git a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs index 3274e8884..9ad375330 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs @@ -76,69 +76,27 @@ public CleavageSpecificity GetCleavageSpecificity(Protein protein, int startInde /// /// /// - internal List GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, + internal IEnumerable GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, int minPeptideLength, int maxPeptideLength, Protease specificProtease, bool topDownTruncationSearch = false) { - List peptides = new List(); - - // proteolytic cleavage in one spot (N) - if (CleavageSpecificity == CleavageSpecificity.SingleN) + return CleavageSpecificity switch { - peptides = SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } + // proteolytic cleavage in one spot (N) + CleavageSpecificity.SingleN => SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - // proteolytic cleavage in one spot (C) - else if (CleavageSpecificity == CleavageSpecificity.SingleC) - { - peptides = SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } + // proteolytic cleavage in one spot (C) + CleavageSpecificity.SingleC => SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - //top-down - else if (CleavageSpecificity == CleavageSpecificity.None) - { - if (!topDownTruncationSearch)//standard top-down - { - // retain methionine - if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') - && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full")); - } + //top-down + CleavageSpecificity.None => TopDownDigestion(protein, initiatorMethionineBehavior, minPeptideLength, maxPeptideLength, topDownTruncationSearch), - // cleave methionine - if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') - && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved")); - } - } + // Full proteolytic cleavage + CleavageSpecificity.Full => FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), - // Also digest using the proteolysis product start/end indices - peptides.AddRange( - protein.ProteolysisProducts - .Where(proteolysisProduct => proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue - && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) - .Select(proteolysisProduct => - new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type))); - } - - // Full proteolytic cleavage - else if (CleavageSpecificity == CleavageSpecificity.Full) - { - peptides.AddRange(FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } - - // Cleavage rules for semi-specific search - else if (CleavageSpecificity == CleavageSpecificity.Semi) - { - peptides.AddRange(SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } - else - { - throw new NotImplementedException(); - } - - return peptides; + // Cleavage rules for semi-specific search + CleavageSpecificity.Semi => SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), + _ => throw new NotImplementedException() + }; } /// @@ -281,6 +239,46 @@ private IEnumerable FullDigestion(Protein protein, Initiator } } + /// + /// Gets protein intervals for top-down digestion. + /// + /// + /// + /// + /// + /// + /// + private IEnumerable TopDownDigestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int minPeptideLength, int maxPeptideLength, bool topDownTruncationSearch) + { + if (!topDownTruncationSearch) // standard top-down + { + // retain methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') + && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full"); + } + + // cleave methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') + && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved"); + } + } + + // Also digest using the proteolysis product start/end indices + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + if (proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue + && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type); + } + } + } + /// /// Gets the protein intervals based on semiSpecific digestion rules /// This is the classic, slow semi-specific digestion that generates each semi-specific peptide pre-search diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 954ce449a..615a3618d 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -1,7 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using System.Security.Cryptography; using Omics.Digestion; using Omics.Modifications; @@ -14,7 +12,6 @@ namespace Proteomics.ProteolyticDigestion [Serializable] public class ProteolyticPeptide : DigestionProduct { - internal ProteolyticPeptide(Protein protein, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, int missedCleavages, CleavageSpecificity cleavageSpecificityForFdrCategory, string peptideDescription = null, string baseSequence = null) : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificityForFdrCategory, peptideDescription, baseSequence) { @@ -51,142 +48,40 @@ public string PeptideDescription /// /// /// - internal IEnumerable GetModifiedPeptides(IEnumerable allKnownFixedModifications, + internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, DigestionParams digestionParams, List variableModifications) { + int variable_modification_isoforms = 0; int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForPeptide = digestionParams.MaxModsForPeptide; - var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(peptideLength + 4); - - var pepNTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); + var fixedModDictionary = FixedModDictionaryPool.Get(); - var pepCTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); - - foreach (Modification variableModification in variableModifications) + try { - // Check if can be a n-term mod - if (CanBeNTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) - { - pepNTermVariableMods.Add(variableModification); - } + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + PopulateFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications, in fixedModDictionary); - for (int r = 0; r < peptideLength; r++) - { - if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere." && !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - } - // Check if can be a c-term mod - if (CanBeCTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { - pepCTermVariableMods.Add(variableModification); - } - } + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); - // LOCALIZED MODS - foreach (var kvp in Protein.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } + yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, + CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, variableModPattern, numFixedMods); - int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) + variable_modification_isoforms++; + if (variable_modification_isoforms == maximumVariableModificationIsoforms) { - // Check if can be a n-term mod - if (locInPeptide == 1 && CanBeNTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepNTermVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < peptideLength - && (Protein.IsDecoy || - (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) - && variableModification.LocationRestriction == "Anywhere."))) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - - // Check if can be a c-term mod - if (locInPeptide == peptideLength && CanBeCTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepCTermVariableMods.Add(variableModification); - } + yield break; } } } - - int variable_modification_isoforms = 0; - - foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) + finally { - int numFixedMods = 0; - foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications)) - { - if (!kvp.ContainsKey(ok.Key)) - { - numFixedMods++; - kvp.Add(ok.Key, ok.Value); - } - } - yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, - CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); - variable_modification_isoforms++; - if (variable_modification_isoforms == maximumVariableModificationIsoforms) - { - yield break; - } + FixedModDictionaryPool.Return(fixedModDictionary); + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); } } - - /// - /// Determines whether given modification can be an N-terminal modification - /// - /// - /// - /// - private bool CanBeNTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidue) - && (variableModification.LocationRestriction == "N-terminal." || variableModification.LocationRestriction == "Peptide N-terminal."); - } - - /// - /// Determines whether given modification can be a C-terminal modification - /// - /// - /// - /// - private bool CanBeCTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1) - && (variableModification.LocationRestriction == "C-terminal." || variableModification.LocationRestriction == "Peptide C-terminal."); - } } } \ No newline at end of file diff --git a/mzLib/Test/TestMsFraggerCombinedResults.cs b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs similarity index 83% rename from mzLib/Test/TestMsFraggerCombinedResults.cs rename to mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs index bd5d8834d..731284adb 100644 --- a/mzLib/Test/TestMsFraggerCombinedResults.cs +++ b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs @@ -1,15 +1,12 @@ using NUnit.Framework; using Readers; -using System; using System.Collections.Generic; using System.Linq; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.IO; using TopDownProteomics; -using OxyPlot; using System.Diagnostics.CodeAnalysis; -namespace Test +namespace Test.FileReadingTests { [ExcludeFromCodeCoverage] internal class TestMsFraggerCombinedResults @@ -22,8 +19,8 @@ public void TestLoadResultsCount(string path) MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); ms.LoadResults(); - Assert.That(ms.AllPsmFiles.Count.Equals(2)); - Assert.That(ms.Results.Count.Equals(8)); + NUnit.Framework.Assert.That(ms.AllPsmFiles.Count.Equals(2)); + NUnit.Framework.Assert.That(ms.Results.Count.Equals(8)); } [Test] @@ -36,8 +33,8 @@ public void TestLoadResults(string path) List results = ms.Results.Select(psm => psm.FileName).ToList(); - Assert.That((results.Count(s => s.Contains("A_1"))).Equals(4)); - Assert.That((results.Count(s => s.Contains("A_2"))).Equals(4)); + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_1")).Equals(4)); + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_2")).Equals(4)); } [Test] @@ -61,8 +58,8 @@ public void TestFileNameToFilePathWithParameter(string path) foreach (var fileName in results) { - Assert.That(allFiles.TryGetValue(fileName, out var output)); - Assert.That(filePaths.Contains(output)); + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); } } @@ -80,8 +77,8 @@ public void TestFileNameToFilePathWithoutParameter(string path) foreach (var fileName in results) { - Assert.That(allFiles.TryGetValue(fileName, out var output)); - Assert.That(filePaths.Contains(output)); + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); } } @@ -101,7 +98,7 @@ public void TestExperimentAnnotationFile(string path) ExperimentAnnotationFile experimentAnnotation = FileReader.ReadFile(fileToRead); experimentAnnotation.WriteResults(fileToWrite); - Assert.That(File.Exists(fileToWrite)); + NUnit.Framework.Assert.That(File.Exists(fileToWrite)); File.Delete(fileToWrite); } diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index d2d41cba7..767fb1564 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -49,130 +49,38 @@ public override string ToString() internal IEnumerable GenerateModifiedOligos(List allKnownFixedMods, RnaDigestionParams digestionParams, List variableModifications) { + int variableModificationIsoforms = 0; int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForOligo = digestionParams.MaxMods; - var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(oligoLength + 4); - - var fivePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); + var fixedModDictionary = FixedModDictionaryPool.Get(); - var threePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); - - // collect all possible variable mods, skipping if there is a database annotated modification - foreach (Modification variableModification in variableModifications) + try { - // Check if can be a 5'-term mod - if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) - { - fivePrimeVariableMods.Add(variableModification); - } + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, in fixedModDictionary); - for (int r = 0; r < oligoLength; r++) - { - if (variableModification.LocationRestriction == "Anywhere." && - ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - } - // Check if can be a 3'-term mod - if (CanBeThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) + // Add the mods to the oligo by return numerous OligoWithSetMods + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) { - threePrimeVariableMods.Add(variableModification); - } - } + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); - // collect all localized modifications from the database. - foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); - int locInPeptide = kvp.Key - OneBasedStartResidue + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) + variableModificationIsoforms++; + if (variableModificationIsoforms == maximumVariableModificationIsoforms) { - // Check if can be a 5'-term mod - if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) - { - fivePrimeVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < oligoLength - && (NucleicAcid.IsDecoy || - (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere."))) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - - // Check if can be a 3'-term mod - if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) - { - threePrimeVariableMods.Add(variableModification); - } + yield break; } } } - - int variableModificationIsoforms = 0; - - // Add the mods to the oligo by return numerous OligoWithSetMods - foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) + finally { - int numFixedMods = 0; - foreach (var fixedModPattern in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods)) - { - if (!variableModPattern.ContainsKey(fixedModPattern.Key)) - { - numFixedMods++; - variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); - } - } - yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, - CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); - variableModificationIsoforms++; - if (variableModificationIsoforms == maximumVariableModificationIsoforms) - { - yield break; - } + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); + FixedModDictionaryPool.Return(fixedModDictionary); } } - - private bool CanBeFivePrime(Modification variableModification, int peptideLength) - { - return (variableModification.LocationRestriction == "5'-terminal." || variableModification.LocationRestriction == "Oligo 5'-terminal.") - && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, 1, peptideLength, OneBasedStartResidue); - } - - private bool CanBeThreePrime(Modification variableModification, int peptideLength) - { - return (variableModification.LocationRestriction == "3'-terminal." || variableModification.LocationRestriction == "Oligo 3'-terminal.") - && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); - } } } diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index b9c326d5d..b0b4c3045 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -16,8 +16,8 @@ - - + + @@ -29,8 +29,8 @@ - - + + @@ -95,4 +95,4 @@ - \ No newline at end of file +