paper release

lhatsk · Feb 8, 2023 · 4e496ab · 4e496ab
1 parent 0de8039
commit 4e496ab
Show file tree

Hide file tree

Showing 2 changed files with 167 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -39,13 +39,13 @@ Distograms have shape LxLx128 with the following binning: torch.arange(2.3125,42
 AlphaLink expects a FASTA file containing a single sequence, the crosslinks, and databases for template/ MSA search , [see also OpenFold Inference](https://github.com/aqlaboratory/openfold#inference).
 
 ```
-python3 predict_with_crosslinks.py --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
+python predict_with_crosslinks.py --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
 ```
 
 MSA generation can be skipped if there are precomputed alignments:
 
 ```
-python3 predict_with_crosslinks.py --use_precomputed_alignments msa/ --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt  7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
+python predict_with_crosslinks.py --use_precomputed_alignments msa/ --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt  7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
 ```
 
 ## Network weights
@@ -57,6 +57,21 @@ https://www.dropbox.com/s/5jmb8pxmt5rr751/finetuning_model_5_ptm_distogram.pt.gz
 
 They need to be unpacked (gunzip).
 
+## AlphaLink IHM model deposition [alphalink-ihm-template](https://github.com/grandrea/alphalink-ihm-template)
+
+Script for generating model files in mmCif format for deposition in [PDB-Dev](https://pdb-dev.wwpdb.org/). Requires [python-ihm](https://github.com/ihmwg/python-ihm) from results of AlphaLink
+
+Takes a .csv file with the crosslinks, uniprot accession code and system name to generate a pdb-dev compliant file for deposition. Takes an mmcif file as an input.
+
+First, generate an mmcif file from the .pdb output of AlphaLink using [Maxit](https://sw-tools.rcsb.org/apps/MAXIT/index.html).
+
+Then, edit the make_ihm script to include authors, publication, system name, entity source, deposition database and details as you need.
+
+Then you can run with
+```
+python make_ihm.py
+```
+
 ## PDB models are available at: https://www.dropbox.com/sh/yrto5tzo7u1atqg/AABy2SdP-WFOanp7eOKr3eeoa?dl=0
 
 ## Reproduction instructions

diff --git a/make_ihm.py b/make_ihm.py
@@ -0,0 +1,150 @@
+# https://github.com/grandrea/alphalink-ihm-template/
+# Author Andrea Graziadei
+import ihm
+import ihm.location
+import ihm.dataset
+import ihm.restraint
+import ihm.protocol
+import ihm.model
+import ihm.cross_linkers
+import ihm.reference
+import ihm.dumper
+import ihm.representation
+import pandas as pd
+import ihm.reader
+import ihm.citations
+import sys
+
+if len(sys.argv)!=4:
+    print("usage: python make_ihm.py model.cif ")
+
+model_file = "model_1.cif" #model file converted to cif by MAXIT
+uniprot_accession ="P0A910"
+protein_name ="OmpA"
+restraint_file = "restraint.csv" # a file with 3 space-separated columns for residue from, residue to and restraint confidence (1-FDR)
+
+with open(model_file) as fh:
+    system = ihm.reader.read(fh)
+system=system[0]
+
+#change to PRIDE or whatever repository you have your cx-ms results in
+crosslink_dataset = ihm.dataset.CXMSDataset(ihm.location.DatabaseLocation(db_name="jPOSTrepo",
+                                            db_code="JPST001851"))
+
+
+model_sequence = ihm.reference.UniProtSequence.from_accession(uniprot_accession)
+
+#this would need to be a loop for systems with more than one entity
+entityA = system.entities[0]
+entityA.references = [model_sequence]
+
+#change source as needed using ihm terms
+entityA.source = ihm.source.Natural(ncbi_taxonomy_id=83333,
+                                    common_name="Escherichia coli",
+                                    scientific_name="Escherichia coli K12",
+                                    strain="K12")
+
+entityA.description = (protein_name)
+asymA = system.asym_units[0]
+asymA.description = str(protein_name)
+asymA.details = str(protein_name)
+assembly = ihm.Assembly((asymA,), name='Modeled assembly', description="modeled assembly")
+
+title_string = str("model of " + protein_name)
+
+system.title = title_string
+
+#change authors as needed
+system.authors = ["author 1",
+                  "author 2"]
+
+citation = ihm.Citation(
+    pmid=None,
+    title='mytitle',
+    journal='myjournal', volume=None, page_range=(1, 2), year=2023,
+    authors=["author 1",
+             "author 2"],
+    doi=None)
+system.citations.append(alphalink)
+
+alphalink_software = ihm.Software(name="AlphaLink",
+                                  classification="model building",
+                                  description="protein structure prediction by deep learning assisted by experimental distance restraints",
+                                  location="https://github.com/lhatsk/AlphaLink",
+                                  version="1.0",
+                                  citation=alphalink)
+
+system.software.append(alphalink_software)
+
+rep = ihm.representation.Representation([ihm.representation.AtomicSegment(asymA, rigid=False)])
+
+# Since our input models are plain PDBx, not IHM, we need to add additional
+# required information on the model representation and assembly. This may be
+# better handled by passing the file through python-ihm's
+# util/make-mmcif.py first.
+for state_group in system.state_groups:
+        for state in state_group:
+            for model_group in state:
+                for model in model_group:
+                    if not model.assembly:
+                        model.assembly = assembly
+                    if not model.representation:
+                        model.representation = rep
+
+#define crosslinker. Here, photo-leucine.
+photo_leucine = ihm.ChemDescriptor(auth_name="L-Photo-Leucine",
+                                   chem_comp_id=None,
+                                   smiles="CC1(C[C@H](N)C(O)=O)N=N1",
+                                   inchi="1S/C5H9N3O2/c1-5(7-8-5)2-3(6)4(9)10/h3H,2,6H2,1H3,(H,9,10)/t3-/m0/s1",
+                                   inchi_key="MJRDGTVDJKACQZ-VKHMYHEASA-N",
+                                   common_name="L-Photo-Leucine")
+
+
+#if not using photo-leucine, use ihm.crosslinkers definitions
+crosslink_restraint = ihm.restraint.CrossLinkRestraint(dataset=crosslink_dataset,
+                                                       linker=photo_leucine)
+
+crosslink_df = pd.read_csv(restraint_file, sep=" ", names=["from", "to", "confidence"], header=None)
+crosslinks = []
+# Usually cross-links use an upper bound restraint on the distance
+distance = ihm.restraint.UpperBoundDistanceRestraint(10)
+for index, line in crosslink_df.iterrows():
+    res1ind = int(line["from"])
+    res2ind = int(line["to"])
+    # This assumes that residue indices in the CSV file map 1:1 to mmCIF
+    # seq_ids. Verify by checking the residue names in the ihm_cross_link_list
+    # in the output mmCIF. You may need to add an offset or otherwise map
+    # the residue indices, because it looks off to me.
+    residue_pair = ihm.restraint.ExperimentalCrossLink(
+        residue1=entityA.residue(res1ind), residue2=entityA.residue(res2ind))
+    # This takes a list of all ambiguous cross-links. Here we're saying there
+    # is no ambiguity.
+    crosslink_restraint.experimental_cross_links.append([residue_pair])
+    residue_pair_restraint = ihm.restraint.ResidueCrossLink(experimental_cross_link=residue_pair,
+                                                            asym1=asymA,
+                                                            asym2=asymA,
+                                                            psi=(1 - line["confidence"]),
+                                                            distance=distance)
+    crosslink_restraint.cross_links.append(residue_pair_restraint)
+
+system.restraints.append(crosslink_restraint)
+
+all_datasets = ihm.dataset.DatasetGroup((crosslink_dataset,))
+
+protocol = ihm.protocol.Protocol(name='AlphaLink')
+protocol.steps.append(ihm.protocol.Step(
+    assembly=system.complete_assembly, dataset_group=all_datasets,
+    method='AlphaLink', name='AlphaLink',
+    num_models_begin=0, num_models_end=5, multi_scale=False, ensemble=False))
+
+for state_group in system.state_groups:
+        for state in state_group:
+            for model_group in state:
+                for model in model_group:
+                    model.protocol = protocol
+                    if not model.representation:
+                        model.representation = rep
+
+with open("model.cif", "w", encoding="utf-8") as fh:
+    ihm.dumper.write(fh, [system])
+