-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTrajNetworkAnalysis.py
267 lines (222 loc) · 13.2 KB
/
TrajNetworkAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
"""
A collection of functions to work with LAMMPS trajectories using MDAnalysis and SciKit-Network.
This started as a way to analyze N130 trajectories to get at the networked structure of the condensates generated
by N130 and rpL5, as a minimal system to look at the structure generated by NPM1 and Arginine-rich peptides inside
the nucleolus.
"""
__author__ = 'Furqan Dar'
__version__ = 0.1
import numpy as np
import MDAnalysis as mdan
def read_n130_rpl5_trajectory(top_file: str, trj_file: str, num_of_reps: int = 108, verbose: bool = False):
"""
Specifically made to read in LAMMPS trajectories generated for N130 with peptides. The main reason for this
is to give each molecule a residue name corresponding to either N130 or rpL5.
:param top_file: Name of the topology file. Should be a lammps `write_data` output file.
:param trj_file: Name of the trajectory file. Should be a .dcd file.
:param num_of_reps: The number of repeats of the fundamental repeating unit. For us we have 1 N130 to 15 rpL5s.
:param verbose: Prints out extra information about the trajectory.
:return: [MDAnalysis.Universe, Molecule Lengths, Molecule Numbers]
"""
dum_universe = mdan.Universe(top_file, trj_file, format="LAMMPS")
if verbose:
print("The simulation box is", dum_universe.dimensions[:3])
# Adding the names of the molecules
names_of_res = ['N130']
[names_of_res.append('rpL5') for i in range(15)]
names_of_res = names_of_res * num_of_reps
dum_universe.add_TopologyAttr(topologyattr='resnames', values=names_of_res)
# Getting the lengths of the different molecules (or bits) we care about
n130_len = len(dum_universe.residues[0].atoms)
pept_len = len(dum_universe.residues[1].atoms)
n130_arm_len = int((n130_len - 1) / 5)
mol_lens = [n130_len, n130_arm_len, pept_len]
if verbose:
print("N130 has {:} beads; each arm has {:} beads.".format(n130_len, n130_arm_len))
print("The peptide has {:} beads.".format(pept_len))
pept_num = int(len(dum_universe.select_atoms('resname rpL5')) / pept_len)
n130_num = 108
mol_nums = [n130_num, pept_num]
return [dum_universe, mol_lens, mol_nums]
def gen_type_str_from_type_list(dum_list: list = None):
"""
Given a list containing integers corresponding to types, we generate a string that combines all of them separated
by a space ' '.
"""
if dum_list is None:
dum_list = [6, 7, 10, 12, 17]
dum_str = list(map(str, dum_list))
dum_str = " ".join(dum_str)
return f'type {dum_str:}'
def gen_contact_atomsel_pairs_using_FastNS(traj_universe: mdan.Universe,
at_sel_1: mdan.AtomGroup,
at_sel_2: mdan.AtomGroup,
r_cut: float):
"""
Given the two atom selections, we use the NeighborSearch algorithms from MDAnalysis to return all pairs of contacts.
Remember that the pairs are in ID-space of the AtomGroups, and not indices with respect to the Universe.
"""
dum_neigh_search = mdan.lib.nsgrid.FastNS(cutoff=r_cut, coords=at_sel_1.positions, box=traj_universe.dimensions)
dum_result = dum_neigh_search.search(at_sel_2.positions)
return dum_result.get_pairs()
def gen_contact_mol_pairs_using_FastNS(traj_universe: mdan.Universe,
at_sel_1: mdan.AtomGroup,
at_sel_2: mdan.AtomGroup,
r_cut: float):
"""
Given the two atom selections, we go generate the pairs of contacts with respect to real molecule IDs.
We simply take the pairs and explicitly calculate the molecule IDs for the pairs, and return the pairs of molecular
contacts.
"""
pairs_ids = gen_contact_atomsel_pairs_using_FastNS(traj_universe=traj_universe, at_sel_1=at_sel_1,
at_sel_2=at_sel_2, r_cut=r_cut)
at_sel_1_mols = at_sel_1.fragindices
at_sel_2_mols = at_sel_2.atoms.fragindices
pairs_mol = np.array([at_sel_1_mols[pairs_ids.T[1]], at_sel_2_mols[pairs_ids.T[0]]]).T
return pairs_mol
def gen_unweighted_adj_mat_from_mol_pairs_using_FastNS(mol_pairs: np.ndarray, tot_mols: int):
"""
Given the pairs of molecular contacts, we explicitly loop over the pairs and generate an unweighted adjacency matrix.
Each pair contains molecule IDs, which are directly the indecies for the adjacency matrix. If we had instead wanted the
weighted graph, we would add all the contacts.
:param mol_pairs: List of pairs of molecules in contact.
:param tot_mols: Total number of molecules in the system. For N130-rpL5 we have 108 N130, and 15 rpL5 per N130.
:return adj_mat: adjacency matrix for the N130-rpL5 system.
"""
unique_pairs = np.unique(np.sort(mol_pairs, axis=1), axis=0)
adj_mat = np.zeros((tot_mols, tot_mols), int)
adj_mat[mol_pairs[:, 0], mol_pairs[:, 1]] = 1
adj_mat[mol_pairs[:, 1], mol_pairs[:, 0]] = 1
return adj_mat
def gen_unweighted_adj_mat_between_atom_selections(traj_universe: mdan.Universe,
at_sel_1: mdan.AtomGroup,
at_sel_2: mdan.AtomGroup,
r_cut: float,
tot_mols: int):
"""
Given the two atom selections, and assuming we are looping over a frame, we get the unweighted molecular adjacency
matrices between the two atom selections.
:param traj_universe:
:param at_sel_1:
:param at_sel_2:
:param r_cut:
:return:
"""
pairs_mols = gen_contact_mol_pairs_using_FastNS(traj_universe=traj_universe, at_sel_1=at_sel_1, at_sel_2=at_sel_2,
r_cut=r_cut)
return gen_unweighted_adj_mat_from_mol_pairs_using_FastNS(mol_pairs=pairs_mols, tot_mols=tot_mols)
def n130_network_analysis_unweighted_adjacency_matrix_between_atom_selections(top_file_path: str,
trj_file_path: str,
at_sel_1: list,
at_sel_2: list,
trj_first: int,
trj_last: int,
trj_step: int,
bond_cutoff: float):
"""
Generate unweighted adjacency matrices for the two atom selections, given the definition of a bond by bond_cutoff.
This function is specifically to analyze the trajectories for the CG N130-rpL5 systems.
:param top_file_path: Full path of the topology information file. Must be a LAMMPS data file.
:param trj_file_path: Full path of the DCD trajectory to analyze.
:param at_sel_1: List of bead types. E.g. the acidic beads in an acidic tract of N130.
:param at_sel_2: List of bead types. E.g. the basic beads in rpL5.
:param trj_first: First frame of the trajectory to start analysis.
:param trj_last: Last frame till trajectory is analyzed.
:param trj_step: Step size for looping over the trajectory.
:param bond_cutoff: Maximal distance to be considered a bond. For N130-rpL5 work, we use the first RDF minima.
:return: (n, M, M) adjacency matrix where n is the number of frames analyzed, and M is the total number of molecules
in the system.
"""
trj_universe, trj_mol_lens, trj_mol_nums = read_n130_rpl5_trajectory(top_file=top_file_path, trj_file=trj_file_path,
num_of_reps=108, verbose=False)
atom_sel_1 = gen_type_str_from_type_list(at_sel_1)
atom_sel_1 = trj_universe.select_atoms(atom_sel_1)
atom_sel_2 = gen_type_str_from_type_list(at_sel_2)
atom_sel_2 = trj_universe.select_atoms(atom_sel_2)
traj_subset = trj_universe.trajectory[trj_first:trj_last:trj_step]
tot_mols = 108 + 108 * 15
adj_mats = np.zeros((len(traj_subset), tot_mols, tot_mols), int)
for frm_id, a_frame in enumerate(traj_subset):
adj_mats[frm_id] = gen_unweighted_adj_mat_between_atom_selections(traj_universe=trj_universe,
at_sel_1=atom_sel_1, at_sel_2=atom_sel_2,
r_cut=bond_cutoff, tot_mols=tot_mols)
return adj_mats
def lj_network_analysis_unweighted_adjacency_degrees(top_file_path: str,
trj_file_path: str,
trj_first: int,
trj_last: int,
trj_step: int,
bond_cutoff: float) -> np.ndarray:
"""
Generates a per-frame array of degrees calculated from the unweighted adjacency matrices constructed using bond_cut
off as the bond radius. This version is specifically for the lj-fluid simulations, and therefore assumes that the
system only contains 'type 1' molecules.
:param top_file_path: Full path of the topology information file. Must be a LAMMPS data file.
:param trj_file_path: Full path of the LAMMPS trajectory.
:param trj_first: First frame to start analyzing from.
:param trj_last: Last frame till analysis should be run.
:param trj_step: Step-size for iterating over frames.
:param bond_cutoff: Maximal distance for two-beads to be considered bonded. Usually first minima of g(r)
:return: (n, M) array containing the graph degrees where graph size is M, and number of frames is n.
"""
trj_universe = mdan.Universe(f"{top_file_path}", f"{trj_file_path}", format="LAMMPSDUMP",
atom_style='id type x y z')
atom_sel = trj_universe.select_atoms("type 1")
traj_subset = trj_universe.trajectory[trj_first:trj_last:trj_step]
tot_mols = atom_sel.atoms.n_atoms
degs_Arr = np.zeros((len(traj_subset), tot_mols), int)
for frm_id, a_frame in enumerate(traj_subset):
degs_Arr[frm_id] = np.sum(
gen_unweighted_adj_mat_between_atom_selections(traj_universe=trj_universe, at_sel_1=atom_sel,
at_sel_2=atom_sel, r_cut=bond_cutoff, tot_mols=tot_mols),
axis=0) - 1.0
return degs_Arr
def gen_histograms_from_degrees_for_multiple_frames(frame_degrees: np.ndarray, bins_lo: int, bins_hi: int):
"""
Given a list of degrees of shape (N,M) where we have N frames and M molecules, we calculate the histogram, given
bins_lo and bins_hi, of each frame and return it back.
:param frame_degrees:
:param bins_lo:
:param bins_hi:
:return:
"""
return np.array(
[np.histogram(aFrame, bins=np.arange(bins_lo, bins_hi), density=True)[0] for aFrame in frame_degrees])
def gen_histogram_from_per_frame_degrees_for_multiple_replicates(rep_degrees: np.ndarray, bins_lo: int, bins_hi: int):
"""
Calculate the averages over multiple frames of the degree histograms for each of the replicates.
:param rep_degrees:
:param bins_lo:
:param bins_hi:
:return:
"""
return np.array(
[gen_histograms_from_degrees_for_multiple_frames(frame_degrees=a_rep, bins_lo=bins_lo, bins_hi=bins_hi) for
a_rep in rep_degrees])
def for_histograms_get_average_over_replicates(sys_degrees: dict):
"""
Average over the replicates to generate the mean values for the histograms, and calculate the std over the replicates
to get the standard error of the mean.
:param sys_degrees:
:return: {'mean':mean_of_replicates, 'stde':stde_of_replicates}
"""
return {'mean': np.mean(sys_degrees, axis=0), 'stde': np.std(sys_degrees, axis=0)}
def lj_network_get_degree_histograms(tot_dir: str, sys_name: str, num_reps: int, bins_lo: int, bins_hi: int):
"""
Convenient helper to fetch the raw degree distributions for a particular system, and calculate averages over all the
frames, and then over the replicates to give the average histograms and stde of those histograms.
:param tot_dir
:param sys_name:
:param num_reps:
:param bins_lo:
:param bins_hi:
:return:
"""
assert sys_name in ['gas', 'fluid',
'solid'], f"sys_name:{sys_name} is not allowed. Only 'gas', 'fluid', or 'solid'."
raw_data = np.array([np.load(f"{tot_dir}{sys_name}/Run_{repNum}/{sys_name}_{repNum}_degs.npz")['a'] for repNum in
range(1, num_reps + 1)])
frame_ave = gen_histogram_from_per_frame_degrees_for_multiple_replicates(rep_degrees=raw_data, bins_hi=bins_hi,
bins_lo=bins_lo)
sys_ave = np.mean(frame_ave, axis=1)
return for_histograms_get_average_over_replicates(sys_ave)