diff --git a/README.md b/README.md index 7a707a2..3a97883 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,30 @@ -# per-residue-distance -Calculates the per-residue atomic distance between two chains + +# Plotting code for per-residue atomic distances + + + +Obtain rotation-translation matrices from the PDBe FTP server: + +```https://ftp.ebi.ac.uk/pub/databases/pdbe-kb/superposition/[0]//.json``` + +E.g. + +```wget https://ftp.ebi.ac.uk/pub/databases/pdbe-kb/superposition/A/A0QTT2/A0QTT2.json``` + + +Obtain updated mmCIF files (the normal, non-SIFTS mmCIF file type will not work) using: + + +```https://www.ebi.ac.uk/pdbe/entry-files/download/_updated.cif``` + +E.g. + +```wget https://www.ebi.ac.uk/pdbe/entry-files/download/7cyr_updated.cif``` + +## Example + +``` python +python3 per_residue_distance.py --mmcif1 example_data/7cyr_updated.cif.gz --mmcif2 example_data/7cy2_updated.cif.gz --rt_matrices example_data/A0QTT2.json --pdb_id1 7cyr --pdb_id2 7cy2 --chain1 A --chain2 A +``` + +Chain IDs should be parsed as `label_asym_id` from the updated mmcif `atom_site` loop. \ No newline at end of file diff --git a/example_data/7cy2_updated.cif.gz b/example_data/7cy2_updated.cif.gz new file mode 100644 index 0000000..f5f0bc0 Binary files /dev/null and b/example_data/7cy2_updated.cif.gz differ diff --git a/example_data/7cyr_updated.cif.gz b/example_data/7cyr_updated.cif.gz new file mode 100644 index 0000000..a8f22ba Binary files /dev/null and b/example_data/7cyr_updated.cif.gz differ diff --git a/example_data/A0QTT2.json b/example_data/A0QTT2.json new file mode 100644 index 0000000..ddfb773 --- /dev/null +++ b/example_data/A0QTT2.json @@ -0,0 +1,95 @@ +{ + "7cy2_A": { + "pdb_id": "7cy2", + "auth_asym_id": "A", + "struct_asym_id": "A", + "matrix": [ + [ + 1.0, + 0.0, + 0.0, + 0.0 + ], + [ + 0.0, + 1.0, + -0.0, + 0.0 + ], + [ + -0.0, + -0.0, + 1.0, + 0.0 + ], + [ + 0, + 0, + 0, + 1 + ] + ] + }, + "7cyr_A": { + "pdb_id": "7cyr", + "auth_asym_id": "A", + "struct_asym_id": "A", + "matrix": [ + [ + 0.938, + -0.289, + 0.19, + -12.116 + ], + [ + -0.038, + 0.46, + 0.887, + 4.053 + ], + [ + -0.344, + -0.84, + 0.42, + 4.58 + ], + [ + 0, + 0, + 0, + 1 + ] + ] + }, + "7cz2_A": { + "pdb_id": "7cz2", + "auth_asym_id": "A", + "struct_asym_id": "A", + "matrix": [ + [ + -0.085, + 0.465, + 0.881, + -24.287 + ], + [ + 0.348, + 0.843, + -0.412, + 17.893 + ], + [ + -0.934, + 0.271, + -0.233, + 28.361 + ], + [ + 0, + 0, + 0, + 1 + ] + ] + } +} \ No newline at end of file diff --git a/per_residue_distance.py b/per_residue_distance.py new file mode 100644 index 0000000..8cb73ca --- /dev/null +++ b/per_residue_distance.py @@ -0,0 +1,321 @@ + +import json +from pathlib import PosixPath +import gemmi +from gemmi import cif, make_structure_from_block +import numpy as np +import pandas as pd +import logging +import argparse +from matplotlib import pyplot as plt + + +def calculate_static_rmsd( + polymer1: gemmi.PolymerType.PeptideL, + polymer2: gemmi.PolymerType.PeptideL, + default_ptype: gemmi.PolymerType = gemmi.PolymerType.PeptideL, +) -> float: + """ + Calculate the RMSD between two structures, without applying any transformation or + superposition. This should have either been done already or intentionally skipped. + The two structures must be of the same polymer type (left-handed protein by + default). + + :param polymer1: First structure as Gemmi Polymer object + :type polymer1: gemmi.PolymerType.PeptideL + :param polymer2: Second structure as Gemmi Polymer object + :type polymer2: gemmi.PolymerType.PeptideL + :raises ValueError: Not protein structures + :return: RMSD between the two structures + :rtype: float + """ + + if polymer1.check_polymer_type() == polymer2.check_polymer_type() == default_ptype: + rmsds = gemmi.calculate_current_rmsd( + polymer1, polymer2, default_ptype, gemmi.SupSelect.CaP # TODO Needed? + ) + + return rmsds.rmsd + + else: + raise ValueError("Polymer types not the same or supported") + + + + +def load_mmcif( + path: PosixPath | str, return_model: bool = False +) -> gemmi.cif.Block | gemmi.Structure: + """ + Returns mmCIF Gemmi object given path to file. + Modified from https://github.com/PDBeurope/protein-cluster-conformers + + :param path: Load (updated) mmCIF file into memory. + :type path: pathlib.Path | str + :param return_model: Return Gemmi Structure object instead of Block object + :type return_model: bool + :raises ValueError: File parsed was not mmCIF or gzip mmCIF + :return: Contents of mmCIF as Gemmi Block object + :rtype: gemmi.cif.Block | gemmi.Structure + """ + path = str(path) + + if path[-6:] == "cif.gz": + cif_block = cif.read(path).sole_block() + + elif path[-3:] == "cif": + cif_block = cif.read_file(path).sole_block() + + else: + raise ValueError("File parsed was not mmCIF or gzip mmCIF") + + # Convert to Gemmi Structure object if required + return make_structure_from_block(cif_block)[0] if return_model else cif_block + + + + +def load_RT_matrices(path: PosixPath | str) -> dict: + """ + Load a JSON file containing a set of RT matrices. The matrices included in the file + should be 4x4 affine matrices, with the 3x3 rotation matrix in the upper left corner + and the 3x1 translation vector in the right column. + + :param path: Path to JSON file + :type path: PosixPath | str + :return: Dictionary of RT matrices. Keyed by PDB and chain (auth_asym_id) ID. + :rtype: dict + """ + + with open(path, "r") as f: + rt_matrices = json.load(f) + + return rt_matrices + + + + + +def affine_to_transformation(affine_matx: "list[float]") -> gemmi.Transform: + """ + Convert an affine matrix to a Gemmi transformation object. + + :param affine_matx: 4x4 affine matrix. Includes the 3x3 rotation matrix in the upper + left corner and the 3x1 translation vector in the right column. + :type affine_matx: list[float] + :return: Gemmi transformation object + :rtype: gemmi.Transform + """ + + # Parse affine matrix into 3x3 rotation matrix + m = [ + [affine_matx[0][0], affine_matx[0][1], affine_matx[0][2]], + [affine_matx[1][0], affine_matx[1][1], affine_matx[1][2]], + [affine_matx[2][0], affine_matx[2][1], affine_matx[2][2]], + ] + rotation_matrix = gemmi.Mat33(m) + + # Parse affine matrix into 3x1 translation vector + translation_vector = gemmi.Vec3( + affine_matx[0][3], affine_matx[1][3], affine_matx[2][3] + ) + + return gemmi.Transform(rotation_matrix, translation_vector) + + + +def block_to_df(block: gemmi.cif.Block) -> pd.DataFrame: + loop_names = [ + "group_PDB", + "label_asym_id", + "auth_asym_id", + "pdbx_sifts_xref_db_num", + "label_seq_id", + ] + + table = pd.DataFrame(block.find("_atom_site.", loop_names), columns=loop_names) + + return table + + + +def get_unp_residue_indices(model: pd.DataFrame, chain_id: str) -> set: + + # TODO: Consider using Gemmi's Selection class, although it might be slower + # considering it uses the older MMDB parsing syntax. + + df = block_to_df(model) + + res_ids = df["pdbx_sifts_xref_db_num"][ + (df["label_asym_id"] == chain_id) + & (df["group_PDB"] == "ATOM") + & (df["pdbx_sifts_xref_db_num"] != "?") + ].unique() + + # Sort and convert to int + return set(map(int, res_ids)) + + +def fill_missing_ends( + all_data, all_res_ids, min_res_id, max_res_id, fill_type=np.full(3, np.nan) +): + + # Add NaNs to end of arrays + for key, unp_res_id in all_res_ids.items(): + difference = max_res_id - max(unp_res_id) + + if difference == 0: + continue + + appendage_array = np.full((difference, 3), np.nan) + + all_data[key] = np.concatenate((all_data[key], appendage_array)) + + return all_data + + +def calculate_atom_distances(coordinates1, coordinates2): + + coord_difference = coordinates1 - coordinates2 + distances = np.linalg.norm(coord_difference, axis=1) + + return np.array(distances) + + + + + + + +class PositionCalculator: + def __init__(self, model: gemmi.Structure, unp_res_ids: set) -> None: + self.model = model + self.unp_res_ids = sorted(list(unp_res_ids)) + self.all_res_ids = range(1, max(unp_res_ids) + 1) + + # Calculate positions + self.ca_coords = self.__extract_positions() + + def __extract_positions(self, atom_type="CA"): + """ + :return: _description_ + :rtype: tuple( list(float), list(float) ) + """ + + positions = [] + + sorted_res_ids = sorted(list(self.unp_res_ids)) + + index_position = 0 # Keep track of residue to extract + for i in self.all_res_ids: + if i in sorted_res_ids: + ca_position = self.model[index_position][atom_type][0].pos + ca_position = np.array([ca_position[0], ca_position[1], ca_position[2]]) + positions.append(ca_position) + + index_position += 1 + + else: + # Append NaNs + positions += [np.array([np.nan, np.nan, np.nan])] + + return np.array(positions) + + def get_positions(self): + return self.ca_coords + + + + + +if __name__ == "__main__": + + + # Configure logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", + ) + + # Collect arguments + parser = argparse.ArgumentParser() + parser.add_argument("--mmcif1", type=str, help="Path to mmCIF file") + parser.add_argument("--mmcif2", type=str, help="Path to mmCIF file") + parser.add_argument("--rt_matrices", type=str, help="Path to JSON file") + parser.add_argument("--pdb_id1", type=str, help="PDB ID") + parser.add_argument("--pdb_id2", type=str, help="PDB ID") + parser.add_argument("--chain1", type=str, help="Chain ID") + parser.add_argument("--chain2", type=str, help="Chain ID") + args = parser.parse_args() + + # Entry labels + entry1 = f"{args.pdb_id1}_{args.chain1}" + entry2 = f"{args.pdb_id2}_{args.chain2}" + + # Load mmCIF files + model1 = load_mmcif(args.mmcif1, return_model=True) + model2 = load_mmcif(args.mmcif2, return_model=True) + + # Load RT matrices + rt_matrices = load_RT_matrices(args.rt_matrices) + rt_matrices1 = rt_matrices[f"{args.pdb_id1}_{args.chain1}"]["matrix"] + rt_matrices2 = rt_matrices[f"{args.pdb_id2}_{args.chain2}"]["matrix"] + + # Convert RT matrices to Gemmi Transform objects + rt_matrices1 = affine_to_transformation(rt_matrices1) + rt_matrices2 = affine_to_transformation(rt_matrices2) + + # Apply RT matrices to models + model1.transform_pos_and_adp(rt_matrices1) + model2.transform_pos_and_adp(rt_matrices2) + + # Obtain residue indices + block1 = load_mmcif(args.mmcif1, return_model=False) + block2 = load_mmcif(args.mmcif2, return_model=False) + + unp_res_ids1 = get_unp_residue_indices(block1, args.chain1) + unp_res_ids2 = get_unp_residue_indices(block2, args.chain2) + all_res_ids = set() + all_unp_res_ids = {} + for i, j in zip((unp_res_ids1, unp_res_ids2), (entry1, entry2)): + all_res_ids = all_res_ids.union(i) + all_unp_res_ids[j] = i + + # Create atomic position objects + all_atomic_coords = {} + + atomic_coords1 = PositionCalculator(model1[args.chain1], unp_res_ids1).get_positions() + atomic_coords2 = PositionCalculator(model2[args.chain2], unp_res_ids2).get_positions() + + all_atomic_coords[entry1] = atomic_coords1 + all_atomic_coords[entry2] = atomic_coords2 + + # Add missing residues to ends as NaN values + all_coordinates = fill_missing_ends( + all_atomic_coords, + all_unp_res_ids, + min(all_res_ids), + max(all_res_ids), + fill_type=np.array([np.nan, np.nan, np.nan]), + ) + + atomic_distances = calculate_atom_distances( + all_coordinates[entry1], + all_coordinates[entry2] + ) + + fig, ax = plt.subplots(1,1, figsize=(10,5), tight_layout=True) + + ax.plot(range(0, max(all_res_ids)), atomic_distances, color="black", linewidth=0.5) + ax.set_xlabel("UniProt Residue ID") + ax.set_ylabel("Distance (\u212B)") + ax.set_title(f"Euclidean distance between {entry1} vs {entry2}") + + plt.show() + + + + + + +