TrajNetworkAnalysis.py

"""
A collection of functions to work with LAMMPS trajectories using MDAnalysis and SciKit-Network.
This started as a way to analyze N130 trajectories to get at the networked structure of the condensates generated
by N130 and rpL5, as a minimal system to look at the structure generated by NPM1 and Arginine-rich peptides inside
the nucleolus.
"""
__author__ = 'Furqan Dar'
__version__ = 0.1

import numpy as np
import MDAnalysis as mdan


def read_n130_rpl5_trajectory(top_file: str, trj_file: str, num_of_reps: int = 108, verbose: bool = False):
    """
    Specifically made to read in LAMMPS trajectories generated for N130 with peptides. The main reason for this
    is to give each molecule a residue name corresponding to either N130 or rpL5.
    :param top_file: Name of the topology file. Should be a lammps `write_data` output file.
    :param trj_file: Name of the trajectory file. Should be a .dcd file.
    :param num_of_reps: The number of repeats of the fundamental repeating unit. For us we have 1 N130 to 15 rpL5s.
    :param verbose: Prints out extra information about the trajectory.
    :return: [MDAnalysis.Universe, Molecule Lengths, Molecule Numbers]
    """
    dum_universe = mdan.Universe(top_file, trj_file, format="LAMMPS")
    if verbose:
        print("The simulation box is", dum_universe.dimensions[:3])
    # Adding the names of the molecules
    names_of_res = ['N130']
    [names_of_res.append('rpL5') for i in range(15)]
    names_of_res = names_of_res * num_of_reps
    dum_universe.add_TopologyAttr(topologyattr='resnames', values=names_of_res)
    
    # Getting the lengths of the different molecules (or bits) we care about
    n130_len = len(dum_universe.residues[0].atoms)
    pept_len = len(dum_universe.residues[1].atoms)
    n130_arm_len = int((n130_len - 1) / 5)
    mol_lens = [n130_len, n130_arm_len, pept_len]
    if verbose:
        print("N130 has {:} beads; each arm has {:} beads.".format(n130_len, n130_arm_len))
        print("The peptide has {:} beads.".format(pept_len))
    pept_num = int(len(dum_universe.select_atoms('resname rpL5')) / pept_len)
    n130_num = 108
    mol_nums = [n130_num, pept_num]
    return [dum_universe, mol_lens, mol_nums]


def gen_type_str_from_type_list(dum_list: list = None):
    """
    Given a list containing integers corresponding to types, we generate a string that combines all of them separated
    by a space ' '.
    """
    if dum_list is None:
        dum_list = [6, 7, 10, 12, 17]
    
    dum_str = list(map(str, dum_list))
    dum_str = " ".join(dum_str)
    return f'type {dum_str:}'


def gen_contact_atomsel_pairs_using_FastNS(traj_universe: mdan.Universe,
                                           at_sel_1: mdan.AtomGroup,
                                           at_sel_2: mdan.AtomGroup,
                                           r_cut: float):
    """
    Given the two atom selections, we use the NeighborSearch algorithms from MDAnalysis to return all pairs of contacts.
    Remember that the pairs are in ID-space of the AtomGroups, and not indices with respect to the Universe.
    """
    dum_neigh_search = mdan.lib.nsgrid.FastNS(cutoff=r_cut, coords=at_sel_1.positions, box=traj_universe.dimensions)
    dum_result = dum_neigh_search.search(at_sel_2.positions)
    return dum_result.get_pairs()


def gen_contact_mol_pairs_using_FastNS(traj_universe: mdan.Universe,
                                       at_sel_1: mdan.AtomGroup,
                                       at_sel_2: mdan.AtomGroup,
                                       r_cut: float):
    """
    Given the two atom selections, we go generate the pairs of contacts with respect to real molecule IDs.
    We simply take the pairs and explicitly calculate the molecule IDs for the pairs, and return the pairs of molecular
    contacts.
"""
    pairs_ids = gen_contact_atomsel_pairs_using_FastNS(traj_universe=traj_universe, at_sel_1=at_sel_1,
                                                       at_sel_2=at_sel_2, r_cut=r_cut)
    at_sel_1_mols = at_sel_1.fragindices
    at_sel_2_mols = at_sel_2.atoms.fragindices
    pairs_mol = np.array([at_sel_1_mols[pairs_ids.T[1]], at_sel_2_mols[pairs_ids.T[0]]]).T
    return pairs_mol


def gen_unweighted_adj_mat_from_mol_pairs_using_FastNS(mol_pairs: np.ndarray, tot_mols: int):
    """
    Given the pairs of molecular contacts, we explicitly loop over the pairs and generate an unweighted adjacency matrix.
    Each pair contains molecule IDs, which are directly the indecies for the adjacency matrix. If we had instead wanted the
    weighted graph, we would add all the contacts.
    :param mol_pairs: List of pairs of molecules in contact.
    :param tot_mols: Total number of molecules in the system. For N130-rpL5 we have 108 N130, and 15 rpL5 per N130.
    :return adj_mat: adjacency matrix for the N130-rpL5 system.
    """
    
    unique_pairs = np.unique(np.sort(mol_pairs, axis=1), axis=0)
    
    adj_mat = np.zeros((tot_mols, tot_mols), int)
    
    adj_mat[mol_pairs[:, 0], mol_pairs[:, 1]] = 1
    adj_mat[mol_pairs[:, 1], mol_pairs[:, 0]] = 1
    
    return adj_mat


def gen_unweighted_adj_mat_between_atom_selections(traj_universe: mdan.Universe,
                                                   at_sel_1: mdan.AtomGroup,
                                                   at_sel_2: mdan.AtomGroup,
                                                   r_cut: float,
                                                   tot_mols: int):
    """
    Given the two atom selections, and assuming we are looping over a frame, we get the unweighted molecular adjacency
    matrices between the two atom selections.
    :param traj_universe:
    :param at_sel_1:
    :param at_sel_2:
    :param r_cut:
    :return:
    """
    pairs_mols = gen_contact_mol_pairs_using_FastNS(traj_universe=traj_universe, at_sel_1=at_sel_1, at_sel_2=at_sel_2,
                                                    r_cut=r_cut)
    
    return gen_unweighted_adj_mat_from_mol_pairs_using_FastNS(mol_pairs=pairs_mols, tot_mols=tot_mols)


def n130_network_analysis_unweighted_adjacency_matrix_between_atom_selections(top_file_path: str,
                                                                              trj_file_path: str,
                                                                              at_sel_1: list,
                                                                              at_sel_2: list,
                                                                              trj_first: int,
                                                                              trj_last: int,
                                                                              trj_step: int,
                                                                              bond_cutoff: float):
    """
    Generate unweighted adjacency matrices for the two atom selections, given the definition of a bond by bond_cutoff.
    This function is specifically to analyze the trajectories for the CG N130-rpL5 systems.
    :param top_file_path: Full path of the topology information file. Must be a LAMMPS data file.
    :param trj_file_path: Full path of the DCD trajectory to analyze.
    :param at_sel_1: List of bead types. E.g. the acidic beads in an acidic tract of N130.
    :param at_sel_2: List of bead types. E.g. the basic beads in rpL5.
    :param trj_first: First frame of the trajectory to start analysis.
    :param trj_last: Last frame till trajectory is analyzed.
    :param trj_step: Step size for looping over the trajectory.
    :param bond_cutoff: Maximal distance to be considered a bond. For N130-rpL5 work, we use the first RDF minima.
    :return: (n, M, M) adjacency matrix where n is the number of frames analyzed, and M is the total number of molecules
    in the system.
    """
    trj_universe, trj_mol_lens, trj_mol_nums = read_n130_rpl5_trajectory(top_file=top_file_path, trj_file=trj_file_path,
                                                                         num_of_reps=108, verbose=False)
    atom_sel_1 = gen_type_str_from_type_list(at_sel_1)
    atom_sel_1 = trj_universe.select_atoms(atom_sel_1)
    
    atom_sel_2 = gen_type_str_from_type_list(at_sel_2)
    atom_sel_2 = trj_universe.select_atoms(atom_sel_2)
    
    traj_subset = trj_universe.trajectory[trj_first:trj_last:trj_step]
    
    tot_mols = 108 + 108 * 15
    
    adj_mats = np.zeros((len(traj_subset), tot_mols, tot_mols), int)
    
    for frm_id, a_frame in enumerate(traj_subset):
        adj_mats[frm_id] = gen_unweighted_adj_mat_between_atom_selections(traj_universe=trj_universe,
                                                                          at_sel_1=atom_sel_1, at_sel_2=atom_sel_2,
                                                                          r_cut=bond_cutoff, tot_mols=tot_mols)
    return adj_mats


def lj_network_analysis_unweighted_adjacency_degrees(top_file_path: str,
                                                     trj_file_path: str,
                                                     trj_first: int,
                                                     trj_last: int,
                                                     trj_step: int,
                                                     bond_cutoff: float) -> np.ndarray:
    """
    Generates a per-frame array of degrees calculated from the unweighted adjacency matrices constructed using bond_cut
    off as the bond radius. This version is specifically for the lj-fluid simulations, and therefore assumes that the
    system only contains 'type 1' molecules.
    :param top_file_path: Full path of the topology information file. Must be a LAMMPS data file.
    :param trj_file_path: Full path of the LAMMPS trajectory.
    :param trj_first: First frame to start analyzing from.
    :param trj_last: Last frame till analysis should be run.
    :param trj_step: Step-size for iterating over frames.
    :param bond_cutoff: Maximal distance for two-beads to be considered bonded. Usually first minima of g(r)
    :return: (n, M) array containing the graph degrees where graph size is M, and number of frames is n.
    """
    trj_universe = mdan.Universe(f"{top_file_path}", f"{trj_file_path}", format="LAMMPSDUMP",
                                 atom_style='id type x y z')
    
    atom_sel = trj_universe.select_atoms("type 1")
    
    traj_subset = trj_universe.trajectory[trj_first:trj_last:trj_step]
    
    tot_mols = atom_sel.atoms.n_atoms
    
    degs_Arr = np.zeros((len(traj_subset), tot_mols), int)
    
    for frm_id, a_frame in enumerate(traj_subset):
        degs_Arr[frm_id] = np.sum(
                gen_unweighted_adj_mat_between_atom_selections(traj_universe=trj_universe, at_sel_1=atom_sel,
                                                               at_sel_2=atom_sel, r_cut=bond_cutoff, tot_mols=tot_mols),
                axis=0) - 1.0
    
    return degs_Arr


def gen_histograms_from_degrees_for_multiple_frames(frame_degrees: np.ndarray, bins_lo: int, bins_hi: int):
    """
    Given a list of degrees of shape (N,M) where we have N frames and M molecules, we calculate the histogram, given
    bins_lo and bins_hi, of each frame and return it back.
    :param frame_degrees:
    :param bins_lo:
    :param bins_hi:
    :return:
    """
    return np.array(
            [np.histogram(aFrame, bins=np.arange(bins_lo, bins_hi), density=True)[0] for aFrame in frame_degrees])


def gen_histogram_from_per_frame_degrees_for_multiple_replicates(rep_degrees: np.ndarray, bins_lo: int, bins_hi: int):
    """
    Calculate the averages over multiple frames of the degree histograms for each of the replicates.
    :param rep_degrees:
    :param bins_lo:
    :param bins_hi:
    :return:
    """
    return np.array(
            [gen_histograms_from_degrees_for_multiple_frames(frame_degrees=a_rep, bins_lo=bins_lo, bins_hi=bins_hi) for
             a_rep in rep_degrees])


def for_histograms_get_average_over_replicates(sys_degrees: dict):
    """
    Average over the replicates to generate the mean values for the histograms, and calculate the std over the replicates
    to get the standard error of the mean.
    :param sys_degrees:
    :return: {'mean':mean_of_replicates, 'stde':stde_of_replicates}
    """
    return {'mean': np.mean(sys_degrees, axis=0), 'stde': np.std(sys_degrees, axis=0)}


def lj_network_get_degree_histograms(tot_dir: str, sys_name: str, num_reps: int, bins_lo: int, bins_hi: int):
    """
    Convenient helper to fetch the raw degree distributions for a particular system, and calculate averages over all the
    frames, and then over the replicates to give the average histograms and stde of those histograms.
    :param tot_dir
    :param sys_name:
    :param num_reps:
    :param bins_lo:
    :param bins_hi:
    :return:
    """
    assert sys_name in ['gas', 'fluid',
                        'solid'], f"sys_name:{sys_name} is not allowed. Only 'gas', 'fluid', or 'solid'."
    
    raw_data = np.array([np.load(f"{tot_dir}{sys_name}/Run_{repNum}/{sys_name}_{repNum}_degs.npz")['a'] for repNum in
                         range(1, num_reps + 1)])
    frame_ave = gen_histogram_from_per_frame_degrees_for_multiple_replicates(rep_degrees=raw_data, bins_hi=bins_hi,
                                                                             bins_lo=bins_lo)
    sys_ave = np.mean(frame_ave, axis=1)
    
    return for_histograms_get_average_over_replicates(sys_ave)