Skip to content

Commit

Permalink
paper release
Browse files Browse the repository at this point in the history
  • Loading branch information
kstahl committed Feb 8, 2023
1 parent 0de8039 commit 4e496ab
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 2 deletions.
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ Distograms have shape LxLx128 with the following binning: torch.arange(2.3125,42
AlphaLink expects a FASTA file containing a single sequence, the crosslinks, and databases for template/MSA search ([see also OpenFold Inference](https://github.com/aqlaboratory/openfold#inference)).

```
python3 predict_with_crosslinks.py --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
python predict_with_crosslinks.py --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
```

MSA generation can be skipped if there are precomputed alignments:

```
python3 predict_with_crosslinks.py --use_precomputed_alignments msa/ --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
python predict_with_crosslinks.py --use_precomputed_alignments msa/ --checkpoint_path resources/AlphaLink_params/finetuning_model_5_ptm_CACA_10A.pt 7K3N_A.fasta photoL.csv uniref90.fasta mgy_clusters.fa pdb70/pdb70 pdb_mmcif/mmcif_files uniclust30_2018_08/uniclust30_2018_08
```

## Network weights
Expand All @@ -57,6 +57,21 @@ https://www.dropbox.com/s/5jmb8pxmt5rr751/finetuning_model_5_ptm_distogram.pt.gz

They need to be unpacked (gunzip).

## AlphaLink IHM model deposition [alphalink-ihm-template](https://github.com/grandrea/alphalink-ihm-template)

Script for generating model files in mmCIF format from AlphaLink results, for deposition in [PDB-Dev](https://pdb-dev.wwpdb.org/). Requires [python-ihm](https://github.com/ihmwg/python-ihm).

Takes a .csv file with the crosslinks, the UniProt accession code and the system name, plus an mmCIF file of the model as input, and generates a PDB-Dev-compliant file for deposition.

First, generate an mmcif file from the .pdb output of AlphaLink using [Maxit](https://sw-tools.rcsb.org/apps/MAXIT/index.html).

Then, edit the make_ihm script to include authors, publication, system name, entity source, deposition database and details as you need.

Then you can run with
```
python make_ihm.py
```

## PDB models are available at: https://www.dropbox.com/sh/yrto5tzo7u1atqg/AABy2SdP-WFOanp7eOKr3eeoa?dl=0

## Reproduction instructions
Expand Down
150 changes: 150 additions & 0 deletions make_ihm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# https://github.com/grandrea/alphalink-ihm-template/
# Author Andrea Graziadei
import ihm
import ihm.location
import ihm.dataset
import ihm.restraint
import ihm.protocol
import ihm.model
import ihm.cross_linkers
import ihm.reference
import ihm.dumper
import ihm.representation
import pandas as pd
import ihm.reader
import ihm.citations
import sys

# Command-line arguments are never read by this script: every setting below is
# hard-coded and meant to be edited in place. Warn only when arguments were
# actually supplied, instead of printing a usage line on the documented
# no-argument invocation (`python make_ihm.py`).
if len(sys.argv) > 1:
    print("note: make_ihm.py ignores command-line arguments; "
          "edit the settings at the top of the script instead",
          file=sys.stderr)

# Input model file, converted to cif by MAXIT.
model_file = "model_1.cif"
# UniProt accession used to look up the reference sequence.
uniprot_accession = "P0A910"
# Name used for the entity/asym descriptions and the system title.
protein_name = "OmpA"
# A file with 3 space-separated columns: residue from, residue to and
# restraint confidence (1-FDR).
restraint_file = "restraint.csv"

# Parse the input model file; the reader's result is indexed and only its
# first entry is used from here on.
with open(model_file) as cif_handle:
    parsed_systems = ihm.reader.read(cif_handle)
system = parsed_systems[0]

# Repository entry holding the crosslinking-MS data. Change db_name/db_code to
# PRIDE or whichever repository you have your CX-MS results in.
cxms_location = ihm.location.DatabaseLocation(
    db_name="jPOSTrepo",
    db_code="JPST001851",
)
crosslink_dataset = ihm.dataset.CXMSDataset(cxms_location)


# Reference sequence for the modeled protein, looked up by UniProt accession.
model_sequence = ihm.reference.UniProtSequence.from_accession(uniprot_accession)

# Only the first entity and the first asym unit are annotated; this would
# need to be a loop for systems with more than one entity.
entityA = system.entities[0]
asymA = system.asym_units[0]

entityA.references = [model_sequence]
# Change the source as needed using ihm terms.
# NOTE(review): ihm.source is not imported explicitly at the top of this
# file; presumably it is loaded by one of the other ihm imports - confirm.
entityA.source = ihm.source.Natural(
    ncbi_taxonomy_id=83333,
    common_name="Escherichia coli",
    scientific_name="Escherichia coli K12",
    strain="K12",
)
entityA.description = protein_name

# The asym unit carries the protein name as both description and details.
asym_label = str(protein_name)
asymA.description = asym_label
asymA.details = asym_label

# Assembly made of the single annotated asym unit.
assembly = ihm.Assembly(
    (asymA,),
    name='Modeled assembly',
    description="modeled assembly",
)

# System title derived from the protein name.
title_string = "model of " + protein_name
system.title = title_string

# Change authors as needed.
system.authors = [
    "author 1",
    "author 2",
]

# Publication to cite for this model. Replace the placeholder values
# (title, journal, pages, authors, pmid, doi) with the real ones.
citation = ihm.Citation(
    pmid=None,
    title='mytitle',
    journal='myjournal',
    volume=None,
    page_range=(1, 2),
    year=2023,
    authors=["author 1",
             "author 2"],
    doi=None,
)
# The citation used to be referenced through an undefined name (`alphalink`),
# which raised NameError; use the object defined just above.
system.citations.append(citation)

# Software record for AlphaLink.
# NOTE(review): this reuses the citation above. If your publication is not the
# one describing AlphaLink itself, create a separate ihm.Citation for the
# AlphaLink paper and pass that as `citation` here instead.
alphalink_software = ihm.Software(
    name="AlphaLink",
    classification="model building",
    description="protein structure prediction by deep learning assisted by experimental distance restraints",
    location="https://github.com/lhatsk/AlphaLink",
    version="1.0",
    citation=citation,
)

system.software.append(alphalink_software)

# Atomic, non-rigid representation covering the single asym unit.
rep = ihm.representation.Representation(
    [ihm.representation.AtomicSegment(asymA, rigid=False)]
)

# The input models are plain PDBx, not IHM, so the required information on
# model representation and assembly has to be added here. This may be better
# handled by passing the file through python-ihm's util/make-mmcif.py first.
all_models = (
    model
    for state_group in system.state_groups
    for state in state_group
    for model_group in state
    for model in model_group
)
for model in all_models:
    # Only fill in what the model does not already carry.
    if not model.assembly:
        model.assembly = assembly
    if not model.representation:
        model.representation = rep

# Cross-linker definition; here, photo-leucine. If you are not using
# photo-leucine, use one of the definitions in ihm.cross_linkers as the
# `linker` instead.
photo_leucine = ihm.ChemDescriptor(
    auth_name="L-Photo-Leucine",
    common_name="L-Photo-Leucine",
    chem_comp_id=None,
    smiles="CC1(C[C@H](N)C(O)=O)N=N1",
    inchi="1S/C5H9N3O2/c1-5(7-8-5)2-3(6)4(9)10/h3H,2,6H2,1H3,(H,9,10)/t3-/m0/s1",
    inchi_key="MJRDGTVDJKACQZ-VKHMYHEASA-N",
)

# Restraint object tying the cross-link dataset to the linker chemistry.
crosslink_restraint = ihm.restraint.CrossLinkRestraint(
    dataset=crosslink_dataset,
    linker=photo_leucine,
)

# Load the restraints: space-separated, no header row, one cross-link per row.
crosslink_df = pd.read_csv(
    restraint_file,
    sep=" ",
    header=None,
    names=["from", "to", "confidence"],
)
crosslinks = []  # kept as in the original; not read anywhere in the visible code
# Cross-links usually use an upper bound restraint on the distance.
distance = ihm.restraint.UpperBoundDistanceRestraint(10)

for _, row in crosslink_df.iterrows():
    start_index, end_index = int(row["from"]), int(row["to"])
    # This assumes that residue indices in the restraint file map 1:1 to mmCIF
    # seq_ids. Verify by checking the residue names in the
    # ihm_cross_link_list of the output mmCIF; an offset or another mapping
    # of the residue indices may be needed.
    experimental_link = ihm.restraint.ExperimentalCrossLink(
        residue1=entityA.residue(start_index),
        residue2=entityA.residue(end_index),
    )
    # Each appended list holds all ambiguous alternatives for one cross-link;
    # a single-element list states that there is no ambiguity.
    crosslink_restraint.experimental_cross_links.append([experimental_link])
    # Both ends are on the same asym unit; psi is 1 minus the confidence.
    crosslink_restraint.cross_links.append(
        ihm.restraint.ResidueCrossLink(
            experimental_cross_link=experimental_link,
            asym1=asymA,
            asym2=asymA,
            psi=(1 - row["confidence"]),
            distance=distance,
        )
    )

system.restraints.append(crosslink_restraint)

# Group of all input datasets (only the cross-link dataset here).
all_datasets = ihm.dataset.DatasetGroup((crosslink_dataset,))

# Modeling protocol consisting of a single AlphaLink step.
protocol = ihm.protocol.Protocol(name='AlphaLink')
alphalink_step = ihm.protocol.Step(
    assembly=system.complete_assembly,
    dataset_group=all_datasets,
    method='AlphaLink',
    name='AlphaLink',
    num_models_begin=0,
    num_models_end=5,
    multi_scale=False,
    ensemble=False,
)
protocol.steps.append(alphalink_step)

# Attach the protocol to every model, and a representation where one is
# still missing.
models_to_annotate = (
    model
    for state_group in system.state_groups
    for state in state_group
    for model_group in state
    for model in model_group
)
for model in models_to_annotate:
    model.protocol = protocol
    if not model.representation:
        model.representation = rep

# Write the annotated system to model.cif with python-ihm's dumper.
with open("model.cif", "w", encoding="utf-8") as output_handle:
    ihm.dumper.write(output_handle, [system])

0 comments on commit 4e496ab

Please sign in to comment.