-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdiffusion_analysis.py
318 lines (251 loc) · 18.1 KB
/
diffusion_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# coding: utf-8
'''
Authors: Tyler Reddy and Anna Duncan
The purpose of this Python module is to provide utility functions for analyzing the diffusion of particles in molecular dynamics simulation trajectories using either linear or anomalous diffusion models.'''
import numpy
import scipy
import scipy.optimize
def fit_anomalous_diffusion_data(time_data_array,MSD_data_array,degrees_of_freedom=2):
'''This function should fit anomalous diffusion data to Equation 1 in [Kneller2011]_, and return appropriate diffusion parameters.
.. math::
MSD = ND_{\\alpha}t^{\\alpha}
An appropriate coefficient (`N` = 2,4,6 for 1,2,3 `degrees_of_freedom`) will be assigned based on the specified `degrees_of_freedom`. The latter value defaults to 2 (i.e., a planar phospholipid bilayer with `N` = 4).
Input data should include arrays of MSD (in Angstroms ** 2) and time values (in ns).
The results are returned in a tuple.
Parameters
----------
time_data_array : array_like
Input array of time window sizes (nanosecond units)
MSD_data_array : array_like
Input array of MSD values (Angstrom ** 2 units; order matched to time_data_array)
degrees_of_freedom : int
The degrees of freedom for the diffusional process (1, 2 or 3; default 2)
Returns
-------
fractional_diffusion_coefficient
The fractional diffusion coefficient (units of Angstrom ** 2 / ns ** alpha)
standard_deviation_fractional_diffusion_coefficient
The standard deviation of the fractional diffusion coefficent (units of Angstrom ** 2 / ns ** alpha)
alpha
The scaling exponent (no dimensions) of the non-linear fit
standard_deviation_alpha
The standard deviation of the scaling exponent (no dimensions)
sample_fitting_data_X_values_nanoseconds
An array of time window sizes (x values) that may be used to plot the non-linear fit curve
sample_fitting_data_Y_values_Angstroms
An array of MSD values (y values) that may be used to plot the non-linear fit curve
Raises
------
ValueError
If the time window and MSD arrays do not have the same shape
References
----------
.. [Kneller2011] Kneller et al. (2011) J Chem Phys 135: 141105.
Examples
--------
Calculate fractional diffusion coefficient and alpha from artificial data (would typically obtain empirical data from an MD simulation trajectory):
>>> import diffusion_analysis
>>> import numpy
>>> artificial_time_values = numpy.arange(10)
>>> artificial_MSD_values = numpy.array([0.,1.,2.,2.2,3.6,4.7,5.8,6.6,7.0,6.9])
>>> results_tuple = diffusion_analysis.fit_anomalous_diffusion_data(artificial_time_values,artificial_MSD_values)
>>> D, D_std, alpha, alpha_std = results_tuple[0:4]
>>> print D, D_std, alpha, alpha_std
0.268426206526 0.0429995249239 0.891231967011 0.0832911559401
Plot the non-linear fit data:
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> sample_fit_x_values, sample_fit_y_values = results_tuple[4:]
>>> p = plt.plot(sample_fit_x_values,sample_fit_y_values,'-',artificial_time_values,artificial_MSD_values,'.')
.. image:: example_nonlinear.png
'''
if time_data_array.shape != MSD_data_array.shape:
raise ValueError("The shape of time_data_array must match the shape of MSD_data_array.")
def function_to_fit(time,fractional_diffusion_coefficient,scaling_exponent):
coefficient_dictionary = {1:2,2:4,3:6} #dictionary for mapping degrees_of_freedom to coefficient in fitting equation
coefficient = coefficient_dictionary[degrees_of_freedom]
return coefficient * fractional_diffusion_coefficient * (time ** scaling_exponent) #equation 1 in the above paper with appropriate coefficient based on degrees of freedom
#fit the above function to the data and pull out the resulting parameters
optimized_parameter_value_array, estimated_covariance_params_array = scipy.optimize.curve_fit(function_to_fit,time_data_array,MSD_data_array)
#generate sample fitting data over the range of time window values (user can plot if they wish)
sample_fitting_data_X_values_nanoseconds = numpy.linspace(time_data_array[0],time_data_array[-1],100)
sample_fitting_data_Y_values_Angstroms = function_to_fit(sample_fitting_data_X_values_nanoseconds, *optimized_parameter_value_array)
#could then plot the non-linear fit curve in matplotlib with, for example: axis.plot(sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms,color='black')
#could plot the original data points alongside the fit (MSD vs time) with, for example: axis.scatter(time_data_array,MSD_data_array,color='black')
#extract pertinent values from the scipy curve_fit arrays (D_alpha, alpha, and their standard deviations)
parameter_standard_deviation_array = numpy.sqrt(numpy.diagonal(estimated_covariance_params_array))
fractional_diffusion_coefficient = optimized_parameter_value_array[0]
standard_deviation_fractional_diffusion_coefficient = parameter_standard_deviation_array[0]
alpha = optimized_parameter_value_array[1]
standard_deviation_alpha = parameter_standard_deviation_array[1]
return (fractional_diffusion_coefficient, standard_deviation_fractional_diffusion_coefficient, alpha, standard_deviation_alpha,sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms)
def fit_linear_diffusion_data(time_data_array,MSD_data_array,degrees_of_freedom=2):
'''The linear (i.e., normal, random-walk) MSD vs. time diffusion constant calculation.
The results are returned in a tuple.
Parameters
----------
time_data_array : array_like
Input array of time window sizes (nanosecond units)
MSD_data_array : array_like
Input array of MSD values (Angstrom ** 2 units; order matched to time_data_array)
degrees_of_freedom : int
The degrees of freedom for the diffusional process (1, 2 or 3; default 2)
Returns
-------
diffusion_constant
The linear (or normal, random-walk) diffusion coefficient (units of Angstrom ** 2 / ns)
diffusion_constant_error_estimate
The estimated uncertainty in the diffusion constant (units of Angstrom ** 2 / ns), calculated as the difference in the slopes of the two halves of the data. A similar approach is used by GROMACS g_msd [Hess2008]_.
sample_fitting_data_X_values_nanoseconds
An array of time window sizes (x values) that may be used to plot the linear fit
sample_fitting_data_Y_values_Angstroms
An array of MSD values (y values) that may be used to plot the linear fit
Raises
------
ValueError
If the time window and MSD arrays do not have the same shape
References
----------
.. [Hess2008] Hess et al. (2008) JCTC 4: 435-447.
Examples
--------
Calculate linear diffusion coefficient from artificial data (would typically obtain empirical data from an MD simulation trajectory):
>>> import diffusion_analysis
>>> import numpy
>>> artificial_time_values = numpy.arange(10)
>>> artificial_MSD_values = numpy.array([0.,1.,2.,2.2,3.6,4.7,5.8,6.6,7.0,6.9])
>>> results_tuple = diffusion_analysis.fit_linear_diffusion_data(artificial_time_values,artificial_MSD_values)
>>> D, D_error = results_tuple[0:2]
>>> print D, D_error
0.210606060606 0.07
Plot the linear fit data:
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> sample_fit_x_values, sample_fit_y_values = results_tuple[2:]
>>> p = plt.plot(sample_fit_x_values,sample_fit_y_values,'-',artificial_time_values,artificial_MSD_values,'.')
.. image:: example_linear.png
'''
if time_data_array.shape != MSD_data_array.shape:
raise ValueError("The shape of time_data_array must match the shape of MSD_data_array.")
coefficient_dictionary = {1:2.,2:4.,3:6.} #dictionary for mapping degrees_of_freedom to coefficient in fitting equation
coefficient = coefficient_dictionary[degrees_of_freedom]
x_data_array = time_data_array
y_data_array = MSD_data_array
z = numpy.polyfit(x_data_array,y_data_array,1)
slope, intercept = z
diffusion_constant = slope / coefficient
#estimate error in D constant using g_msd approach:
first_half_x_data, second_half_x_data = numpy.array_split(x_data_array,2)
first_half_y_data, second_half_y_data = numpy.array_split(y_data_array,2)
slope_first_half, intercept_first_half = numpy.polyfit(first_half_x_data,first_half_y_data,1)
slope_second_half, intercept_second_half = numpy.polyfit(second_half_x_data,second_half_y_data,1)
diffusion_constant_error_estimate = abs(slope_first_half - slope_second_half) / coefficient
#use poly1d object for polynomial calling convenience (to provide plotting fit data for user if they want to use it):
p = numpy.poly1d(z)
sample_fitting_data_X_values_nanoseconds = numpy.linspace(time_data_array[0],time_data_array[-1],100)
sample_fitting_data_Y_values_Angstroms = p(sample_fitting_data_X_values_nanoseconds)
return (diffusion_constant, diffusion_constant_error_estimate,sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms)
def centroid_array_production_protein(protein_sel,num_protein_copies):
dictionary_centroid_arrays = {}
full_protein_coord_array = protein_sel.positions
list_individual_protein_coord_arrays = numpy.split(full_protein_coord_array,num_protein_copies)
list_per_protein_centroids = [numpy.average(protein_coord_array,axis=0) for protein_coord_array in list_individual_protein_coord_arrays]
dictionary_centroid_arrays['protein'] = numpy.array(list_per_protein_centroids)
return dictionary_centroid_arrays
def mean_square_displacement_by_species(coordinate_file_path, trajectory_file_path, window_size_frames_list, dict_particle_selection_strings, contiguous_protein_selection=None, num_protein_copies = None):
'''Calculate the mean square displacement (MSD) of particles in a molecular dynamics simulation trajectory using the Python `MDAnalysis <http://code.google.com/p/mdanalysis/>`_ package [Michaud-Agrawal2011]_.
Parameters
----------
coordinate_file_path: str
Path to the coordinate file to be used in loading the trajectory.
trajectory_file_path: str or list of str
Path to the trajectory file to be used or ordered list of trajectory file paths.
window_size_frames_list: list
List of window sizes measured in frames. Time values are not used as timesteps and simulation output frequencies can vary.
dict_particle_selection_strings: dict
Dictionary of the MDAnalysis selection strings for each particle set for which MSD values will be calculated separately. Format: {'particle_identifier':'MDAnalysis selection string'}. If a given selection contains more than one particle then the centroid of the particles is used, and if more than one MDAnalysis residue object is involved, then the centroid of the selection within each residue is calculated separately.
contiguous_protein_selection: str
When parsing protein diffusion it may be undesirable to split by residue (i.e., amino acid). You may provide an MDAnalysis selection string for this parameter containing a single type of protein (and no other protein types or other molecules / atoms). This selection string may encompass multiple copies of the same protein which should be specified with the parameter `num_protein_copies`
num_protein_copies: int
The number of protein copies if contiguous_protein_selection is specified. This will allow for the protein coordinates to be split and the individual protein centroids will be used for diffusion analysis.
Returns
-------
dict_MSD_values: dict
Dictionary of mean square displacement data. Contains three keys: MSD_value_dict (MSD values, in Angstrom ** 2), MSD_std_dict (standard deviation of MSD values), frame_skip_value_list (the frame window sizes)
References
----------
.. [Michaud-Agrawal2011] N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. J. Comput. Chem. 32 (2011), 2319–2327
Examples
--------
Extract MSD values from an artificial GROMACS xtc file (results measured in A**2 based on frame window sizes) containing only three amino acid residues.
>>> import diffusion_analysis
>>> import numpy
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> window_size_list_frames = [1,2,4,6]
>>> dict_particle_selection_strings = {'MET':'resname MET','ARG':'resname ARG','CYS':'resname CYS'}
>>> dict_MSD_values = diffusion_analysis.mean_square_displacement_by_species('./test_data/dummy.gro','./test_data/diffusion_testing.xtc',window_size_list_frames,dict_particle_selection_strings)
Plot the results:
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> for residue_name in dict_particle_selection_strings.keys():
... p = ax.errorbar(window_size_list_frames,dict_MSD_values['MSD_value_dict'][residue_name],yerr=dict_MSD_values['MSD_std_dict'][residue_name],label=residue_name,fmt='o')
>>> a = ax.set_xlabel('Window size (frames)')
>>> a = ax.set_ylabel('MSD ($\AA^2$)')
>>> a = ax.set_ylim(-10,600)
>>> a = ax.legend(loc=2,numpoints=1)
.. image:: example_MSD_extraction.png
'''
import MDAnalysis
universe_object = MDAnalysis.Universe(coordinate_file_path,trajectory_file_path)
if not contiguous_protein_selection:
MDA_residue_selection_dictionary = {}
for particle_name, selection_string in dict_particle_selection_strings.items():
MDA_selection = universe_object.select_atoms(selection_string)
MDA_selection_residue_list = MDA_selection.residues #have to break it down by residues, otherwise would end up with centroid of all particles of a given name
list_per_residue_selection_objects = [residue.atoms.select_atoms(selection_string) for residue in MDA_selection_residue_list] #the MDA selection objects PER residue
MDA_residue_selection_dictionary[particle_name] = list_per_residue_selection_objects
def centroid_array_production(current_MDA_selection_dictionary): #actually my modification of this for the github workflow won't work--it will find the centroid of ALL the particles with the same name
'''Produce numpy arrays of centroids organized in a dictionary by particle identifier and based on assignment of particle selections to unique residue objects within MDAnalysis.'''
dictionary_centroid_arrays = {}
for particle_name,list_per_residue_selection_objects in current_MDA_selection_dictionary.items():
list_per_residue_selection_centroids = [residue_selection.centroid() for residue_selection in list_per_residue_selection_objects]
dictionary_centroid_arrays[particle_name] = numpy.array(list_per_residue_selection_centroids)
return dictionary_centroid_arrays
else: #dealing with proteins, where we don't want a breakdown by residue
protein_selection = universe_object.select_atoms(contiguous_protein_selection)
dict_MSD_values = {'MSD_value_dict':{},'MSD_std_dict':{},'frame_skip_value_list':[]} #for overall storage of MSD average / standard deviation values for this replicate
for MSD_window_size_frames in window_size_frames_list:
list_per_window_average_displacements = []
counter = 0
trajectory_striding_dictionary = {} #store values in a dict as you stride through trajectory
for ts in universe_object.trajectory[::MSD_window_size_frames]:
if counter == 0: #first parsed frame
if not contiguous_protein_selection:
previous_frame_centroid_array_dictionary = centroid_array_production(MDA_residue_selection_dictionary)
else:
previous_frame_centroid_array_dictionary = centroid_array_production_protein(protein_selection, num_protein_copies)
else: #all subsequent frames
if not contiguous_protein_selection:
current_frame_centroid_array_dictionary = centroid_array_production(MDA_residue_selection_dictionary)
else:
current_frame_centroid_array_dictionary = centroid_array_production_protein(protein_selection, num_protein_copies)
for particle_name in current_frame_centroid_array_dictionary.keys():
if not particle_name in trajectory_striding_dictionary.keys(): #create the appropriate entry if this particle types hasn't been parsed yet
trajectory_striding_dictionary[particle_name] = {'MSD_value_list_centroids':[]}
current_delta_array_centroids = previous_frame_centroid_array_dictionary[particle_name] - current_frame_centroid_array_dictionary[particle_name]
square_delta_array_centroids = numpy.square(current_delta_array_centroids)
sum_squares_delta_array_centroids = numpy.sum(square_delta_array_centroids,axis=1)
trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'].append(numpy.average(sum_squares_delta_array_centroids))
#reset the value of the 'previous' array as you go along:
previous_frame_centroid_array_dictionary = current_frame_centroid_array_dictionary
#print 'frame:', ts.frame
counter += 1
for particle_name, MSD_data_subdictionary in trajectory_striding_dictionary.items():
if not particle_name in dict_MSD_values['MSD_value_dict'].keys(): #initialize subdictionaries as needed
dict_MSD_values['MSD_value_dict'][particle_name] = []
dict_MSD_values['MSD_std_dict'][particle_name] = []
dict_MSD_values['MSD_value_dict'][particle_name].append(numpy.average(numpy.array(trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'])))
dict_MSD_values['MSD_std_dict'][particle_name].append(numpy.std(numpy.array(trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'])))
dict_MSD_values['frame_skip_value_list'].append(MSD_window_size_frames)
return dict_MSD_values