Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Db writer fix #820

Merged
merged 9 commits into from
Jan 10, 2025
112 changes: 112 additions & 0 deletions mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,118 @@ public void Modification_read_write_into_proteinDb()
Assert.AreEqual(0, ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "xml.xml")).Count);
}

/// <summary>
/// Verifies that the XML written by ProteinDbWriter does not depend on the order in which
/// a protein's modifications are listed: two proteins that carry the same modifications at
/// the same residues, listed in opposite orders, must read back from their written
/// databases as equal proteins with equal, identically ordered modification lists.
/// </summary>
[Test]
public void MultiMod_ProteinDbWriter()
{
    Loaders.LoadElements();
    // The error list reported by the reader is not inspected by this test, so it is discarded.
    var sampleModList = PtmListLoader
        .ReadModsFromFile(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "z.txt"),
            out _).ToList();
    var currentMod = sampleModList.First();

    // Builds a slightly different modification: same target, type, accession, location restriction,
    // feature type and chemical formula as currentMod, but with a prefix on the original id.
    Modification CopyWithIdPrefix(string idPrefix) => new Modification(
        _originalId: idPrefix + currentMod.OriginalId,
        _target: currentMod.Target,
        _modificationType: currentMod.ModificationType,
        _accession: currentMod.Accession,
        _locationRestriction: currentMod.LocationRestriction,
        _featureType: currentMod.FeatureType,
        _chemicalFormula: currentMod.ChemicalFormula);

    // Builds the test protein with the current contents (and current order) of sampleModList
    // attached to residues 2, 4 and 6. Each residue gets its own list instance.
    Protein BuildProteinWithCurrentMods() => new Protein(
        "MCMCMCSSSSSSSS",
        "accession",
        "organism",
        new List<Tuple<string, string>>(),
        new Dictionary<int, List<Modification>>
        {
            { 2, sampleModList.OfType<Modification>().ToList() },
            { 4, sampleModList.OfType<Modification>().ToList() },
            { 6, sampleModList.OfType<Modification>().ToList() },
        },
        null,
        "name",
        "full_name",
        false,
        false,
        new List<DatabaseReference>(),
        new List<SequenceVariation>(),
        disulfideBonds: new List<DisulfideBond>());

    // Writes a single protein to an XML database at the given path and reads that database back.
    // No additional or previously known modifications are supplied, and the dictionary of
    // unknown modifications returned by the loader is not needed here.
    List<Protein> WriteThenLoad(Protein proteinToWrite, string databasePath)
    {
        ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(),
            new List<Protein> { proteinToWrite },
            databasePath);
        return ProteinDbLoader.LoadProteinXML(
            databasePath,
            true, DecoyType.None, new List<Modification>(), false, new List<string>(),
            out Dictionary<string, Modification> _);
    }

    sampleModList.AddRange(new List<Modification>
    {
        CopyWithIdPrefix("1"),
        CopyWithIdPrefix("2"),
        CopyWithIdPrefix("3"),
        CopyWithIdPrefix("4"),
        CopyWithIdPrefix("5"),
    });
    Assert.AreEqual(6, sampleModList.OfType<Modification>().Count());

    // Create a protein with all possible modifications
    Protein protein = BuildProteinWithCurrentMods();
    Assert.AreEqual(6, protein.OneBasedPossibleLocalizedModifications[2].OfType<Modification>().Count());
    Assert.AreEqual(18, protein.OneBasedPossibleLocalizedModifications.SelectMany(kvp => kvp.Value).Count());

    string proteinFileName = Path.Combine(TestContext.CurrentContext.TestDirectory,
        "test_modifications_with_proteins.xml");
    List<Protein> newProteins = WriteThenLoad(protein, proteinFileName);

    // Create a second protein with the same modifications, but listed in a different order.
    sampleModList.Reverse();
    Protein modShuffledProtein = BuildProteinWithCurrentMods();
    string shuffledProteinFileName = Path.Combine(TestContext.CurrentContext.TestDirectory,
        "test_shuffled_modifications_with_proteins.xml");
    List<Protein> newShuffledProteins = WriteThenLoad(modShuffledProtein, shuffledProteinFileName);

    // We've read in proteins from both databases. Assert that they are equal
    Protein reloadedProtein = newProteins.First();
    Protein reloadedShuffledProtein = newShuffledProteins.First();
    Assert.AreEqual(reloadedShuffledProtein.Accession, reloadedProtein.Accession);
    Assert.AreEqual(reloadedShuffledProtein, reloadedProtein);

    // Now, ensure that the modification lists at each modified residue are equivalent
    // (contain the same mods) and equal (contain the same mods in the same order)
    foreach (int oneBasedResidue in new[] { 2, 4, 6 })
    {
        List<Modification> modsFromOriginalOrder = reloadedProtein.OneBasedPossibleLocalizedModifications[oneBasedResidue];
        List<Modification> modsFromShuffledOrder = reloadedShuffledProtein.OneBasedPossibleLocalizedModifications[oneBasedResidue];

        Assert.That(modsFromShuffledOrder, Is.EquivalentTo(modsFromOriginalOrder));
        Assert.That(modsFromShuffledOrder, Is.EqualTo(modsFromOriginalOrder));
    }
}

[Test]
public static void Test_MetaMorpheusStyleProteinDatabaseWriteAndREad()
{
Expand Down
36 changes: 24 additions & 12 deletions mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ private static Dictionary<string, int> WriteNucleicAcidXmlDatabase(
return newModResEntries;
}

/// <summary>
/// <summary>
/// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list.
/// </summary>
/// <param name="additionalModsToAddToProteins"></param>
Expand Down Expand Up @@ -324,8 +324,17 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
}

HashSet<Modification> allRelevantModifications = new HashSet<Modification>(
nonVariantProteins.SelectMany(p => p.SequenceVariations.SelectMany(sv => sv.OneBasedModifications).Concat(p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value))
.Concat(additionalModsToAddToProteins.Where(kv => nonVariantProteins.SelectMany(p => p.SequenceVariations.Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })).Contains(kv.Key)).SelectMany(kv => kv.Value.Select(v => v.Item2))));
nonVariantProteins
.SelectMany(p => p.SequenceVariations
.SelectMany(sv => sv.OneBasedModifications)
.Concat(p.OneBasedPossibleLocalizedModifications)
.SelectMany(kv => kv.Value))
.Concat(additionalModsToAddToProteins
.Where(kv => nonVariantProteins
.SelectMany(p => p.SequenceVariations
.Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession }))
.Contains(kv.Key))
.SelectMany(kv => kv.Value.Select(v => v.Item2))));

foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif))
{
Expand Down Expand Up @@ -384,7 +393,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteStartElement("dbReference");
writer.WriteAttributeString("type", dbRef.Type);
writer.WriteAttributeString("id", dbRef.Id);
foreach (Tuple<string, string> property in dbRef.Properties)
foreach (Tuple<string, string> property in dbRef.Properties.OrderBy(t => t.Item1).ThenBy(t => t.Item2))
{
writer.WriteStartElement("property");
writer.WriteAttributeString("type", property.Item1);
Expand All @@ -397,7 +406,8 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
//for now we are not going to write top-down truncations generated for top-down truncation search.
//some day we could write those if observed
//the truncation designation is contained in the "type" field of ProteolysisProduct
List<ProteolysisProduct> proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList();
List<ProteolysisProduct> proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation"))
.OrderBy(p => p.OneBasedBeginPosition).ToList();
foreach (var proteolysisProduct in proteolysisProducts)
{
writer.WriteStartElement("feature");
Expand All @@ -413,23 +423,24 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement();
}

foreach (var hm in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
{
foreach (var modId in hm.Value)
foreach (var modId in positionModKvp.Value.OrderBy(mod => mod))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "modified residue");
writer.WriteAttributeString("description", modId);
writer.WriteStartElement("location");
writer.WriteStartElement("position");
writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture));
writer.WriteAttributeString("position", positionModKvp.Key.ToString(CultureInfo.InvariantCulture));
writer.WriteEndElement();
writer.WriteEndElement();
writer.WriteEndElement();
}
}

foreach (var hm in protein.SequenceVariations)

foreach (var hm in protein.SequenceVariations.OrderBy(sv => sv.OneBasedBeginPosition).ThenBy(sv => sv.VariantSequence))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "sequence variant");
Expand Down Expand Up @@ -458,7 +469,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
}
foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
{
foreach (var modId in hmm.Value)
foreach (var modId in hmm.Value.OrderBy(mod => mod))
{
writer.WriteStartElement("subfeature");
writer.WriteAttributeString("type", "modified residue");
Expand All @@ -475,7 +486,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement(); // feature
}

foreach (var hm in protein.DisulfideBonds)
foreach (var hm in protein.DisulfideBonds.OrderBy(bond => bond.OneBasedBeginPosition))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "disulfide bond");
Expand All @@ -500,7 +511,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement(); // feature
}

foreach (var hm in protein.SpliceSites)
foreach (var hm in protein.SpliceSites.OrderBy(site => site.OneBasedBeginPosition))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "splice site");
Expand Down Expand Up @@ -538,6 +549,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
return newModResEntries;
}


public static void WriteFastaDatabase(List<Protein> proteinList, string outputFileName, string delimeter)
{
using (StreamWriter writer = new StreamWriter(outputFileName))
Expand Down
Loading