diffusion_analysis.py

# coding: utf-8
'''
Authors: Tyler Reddy and Anna Duncan

The purpose of this Python module is to provide utility functions for analyzing the diffusion of particles in molecular dynamics simulation trajectories using either linear or anomalous diffusion models.'''

import numpy
import scipy
import scipy.optimize

def fit_anomalous_diffusion_data(time_data_array,MSD_data_array,degrees_of_freedom=2):
    '''This function should fit anomalous diffusion data to Equation 1 in [Kneller2011]_, and return appropriate diffusion parameters.

    .. math::

        MSD = ND_{\\alpha}t^{\\alpha}

    An appropriate coefficient (`N` = 2,4,6 for 1,2,3 `degrees_of_freedom`) will be assigned based on the specified `degrees_of_freedom`. The latter value defaults to 2 (i.e., a planar phospholipid bilayer with `N` = 4).
    Input data should include arrays of MSD (in Angstroms ** 2) and time values (in ns).
    The results are returned in a tuple.

    Parameters
    ----------
    time_data_array : array_like
        Input array of time window sizes (nanosecond units)
    MSD_data_array : array_like
        Input array of MSD values (Angstrom ** 2 units; order matched to time_data_array)
    degrees_of_freedom : int
        The degrees of freedom for the diffusional process (1, 2 or 3; default 2)

    Returns
    -------
    fractional_diffusion_coefficient
        The fractional diffusion coefficient (units of Angstrom ** 2 / ns ** alpha)
    standard_deviation_fractional_diffusion_coefficient
        The standard deviation of the fractional diffusion coefficent (units of Angstrom ** 2 / ns ** alpha)
    alpha
        The scaling exponent (no dimensions) of the non-linear fit
    standard_deviation_alpha
        The standard deviation of the scaling exponent (no dimensions)
    sample_fitting_data_X_values_nanoseconds
        An array of time window sizes (x values) that may be used to plot the non-linear fit curve
    sample_fitting_data_Y_values_Angstroms
        An array of MSD values (y values) that may be used to plot the non-linear fit curve

    Raises
    ------
    ValueError
        If the time window and MSD arrays do not have the same shape

    References
    ----------

    .. [Kneller2011] Kneller et al. (2011) J Chem Phys 135: 141105.


    Examples
    --------
    Calculate fractional diffusion coefficient and alpha from artificial data (would typically obtain empirical data from an MD simulation trajectory):

    >>> import diffusion_analysis
    >>> import numpy
    >>> artificial_time_values = numpy.arange(10)
    >>> artificial_MSD_values = numpy.array([0.,1.,2.,2.2,3.6,4.7,5.8,6.6,7.0,6.9])
    >>> results_tuple = diffusion_analysis.fit_anomalous_diffusion_data(artificial_time_values,artificial_MSD_values)
    >>> D, D_std, alpha, alpha_std = results_tuple[0:4]
    >>> print D, D_std, alpha, alpha_std
    0.268426206526 0.0429995249239 0.891231967011 0.0832911559401

    Plot the non-linear fit data:

    >>> import matplotlib
    >>> import matplotlib.pyplot as plt
    >>> sample_fit_x_values, sample_fit_y_values = results_tuple[4:]
    >>> p = plt.plot(sample_fit_x_values,sample_fit_y_values,'-',artificial_time_values,artificial_MSD_values,'.')

    .. image:: example_nonlinear.png

        '''

    if time_data_array.shape != MSD_data_array.shape:
        raise ValueError("The shape of time_data_array must match the shape of MSD_data_array.")

    def function_to_fit(time,fractional_diffusion_coefficient,scaling_exponent):
        coefficient_dictionary = {1:2,2:4,3:6} #dictionary for mapping degrees_of_freedom to coefficient in fitting equation
        coefficient = coefficient_dictionary[degrees_of_freedom]
        return coefficient * fractional_diffusion_coefficient * (time ** scaling_exponent) #equation 1 in the above paper with appropriate coefficient based on degrees of freedom

    #fit the above function to the data and pull out the resulting parameters
    optimized_parameter_value_array, estimated_covariance_params_array = scipy.optimize.curve_fit(function_to_fit,time_data_array,MSD_data_array)
    #generate sample fitting data over the range of time window values (user can plot if they wish)
    sample_fitting_data_X_values_nanoseconds = numpy.linspace(time_data_array[0],time_data_array[-1],100)
    sample_fitting_data_Y_values_Angstroms = function_to_fit(sample_fitting_data_X_values_nanoseconds, *optimized_parameter_value_array)
    #could then plot the non-linear fit curve in matplotlib with, for example: axis.plot(sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms,color='black')
    #could plot the original data points alongside the fit (MSD vs time) with, for example: axis.scatter(time_data_array,MSD_data_array,color='black')

    #extract pertinent values from the scipy curve_fit arrays (D_alpha, alpha, and their standard deviations)
    parameter_standard_deviation_array = numpy.sqrt(numpy.diagonal(estimated_covariance_params_array))
    fractional_diffusion_coefficient = optimized_parameter_value_array[0]
    standard_deviation_fractional_diffusion_coefficient = parameter_standard_deviation_array[0]
    alpha = optimized_parameter_value_array[1]
    standard_deviation_alpha = parameter_standard_deviation_array[1]

    return (fractional_diffusion_coefficient, standard_deviation_fractional_diffusion_coefficient, alpha, standard_deviation_alpha,sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms)

def fit_linear_diffusion_data(time_data_array,MSD_data_array,degrees_of_freedom=2):
    '''The linear (i.e., normal, random-walk) MSD vs. time diffusion constant calculation.

    The results are returned in a tuple.

    Parameters
    ----------
    time_data_array : array_like
        Input array of time window sizes (nanosecond units)
    MSD_data_array : array_like
        Input array of MSD values (Angstrom ** 2 units; order matched to time_data_array)
    degrees_of_freedom : int
        The degrees of freedom for the diffusional process (1, 2 or 3; default 2)

    Returns
    -------
    diffusion_constant
        The linear (or normal, random-walk) diffusion coefficient (units of Angstrom ** 2 / ns)
    diffusion_constant_error_estimate
        The estimated uncertainty in the diffusion constant (units of Angstrom ** 2 / ns), calculated as the difference in the slopes of the two halves of the data. A similar approach is used by GROMACS g_msd [Hess2008]_.
    sample_fitting_data_X_values_nanoseconds
        An array of time window sizes (x values) that may be used to plot the linear fit
    sample_fitting_data_Y_values_Angstroms
        An array of MSD values (y values) that may be used to plot the linear fit

    Raises
    ------
    ValueError
        If the time window and MSD arrays do not have the same shape

    References
    ----------

    .. [Hess2008] Hess et al. (2008) JCTC 4: 435-447.

    Examples
    --------
    Calculate linear diffusion coefficient from artificial data (would typically obtain empirical data from an MD simulation trajectory):

    >>> import diffusion_analysis
    >>> import numpy
    >>> artificial_time_values = numpy.arange(10)
    >>> artificial_MSD_values = numpy.array([0.,1.,2.,2.2,3.6,4.7,5.8,6.6,7.0,6.9])
    >>> results_tuple = diffusion_analysis.fit_linear_diffusion_data(artificial_time_values,artificial_MSD_values)
    >>> D, D_error = results_tuple[0:2]
    >>> print D, D_error
    0.210606060606 0.07

    Plot the linear fit data:

    >>> import matplotlib
    >>> import matplotlib.pyplot as plt
    >>> sample_fit_x_values, sample_fit_y_values = results_tuple[2:]
    >>> p = plt.plot(sample_fit_x_values,sample_fit_y_values,'-',artificial_time_values,artificial_MSD_values,'.')

    .. image:: example_linear.png

    '''

    if time_data_array.shape != MSD_data_array.shape:
        raise ValueError("The shape of time_data_array must match the shape of MSD_data_array.")

    coefficient_dictionary = {1:2.,2:4.,3:6.} #dictionary for mapping degrees_of_freedom to coefficient in fitting equation
    coefficient = coefficient_dictionary[degrees_of_freedom]

    x_data_array = time_data_array
    y_data_array = MSD_data_array
    z = numpy.polyfit(x_data_array,y_data_array,1)
    slope, intercept = z
    diffusion_constant = slope / coefficient

    #estimate error in D constant using g_msd approach:
    first_half_x_data, second_half_x_data = numpy.array_split(x_data_array,2)
    first_half_y_data, second_half_y_data = numpy.array_split(y_data_array,2)
    slope_first_half, intercept_first_half = numpy.polyfit(first_half_x_data,first_half_y_data,1)
    slope_second_half, intercept_second_half = numpy.polyfit(second_half_x_data,second_half_y_data,1)
    diffusion_constant_error_estimate = abs(slope_first_half - slope_second_half) / coefficient

    #use poly1d object for polynomial calling convenience (to provide plotting fit data for user if they want to use it):
    p = numpy.poly1d(z)
    sample_fitting_data_X_values_nanoseconds = numpy.linspace(time_data_array[0],time_data_array[-1],100)
    sample_fitting_data_Y_values_Angstroms = p(sample_fitting_data_X_values_nanoseconds)

    return (diffusion_constant, diffusion_constant_error_estimate,sample_fitting_data_X_values_nanoseconds,sample_fitting_data_Y_values_Angstroms)


def centroid_array_production_protein(protein_sel,num_protein_copies):
    dictionary_centroid_arrays = {}
    full_protein_coord_array = protein_sel.positions
    list_individual_protein_coord_arrays = numpy.split(full_protein_coord_array,num_protein_copies)
    list_per_protein_centroids = [numpy.average(protein_coord_array,axis=0) for protein_coord_array in list_individual_protein_coord_arrays]
    dictionary_centroid_arrays['protein'] = numpy.array(list_per_protein_centroids)
    return dictionary_centroid_arrays


def mean_square_displacement_by_species(coordinate_file_path, trajectory_file_path, window_size_frames_list, dict_particle_selection_strings, contiguous_protein_selection=None, num_protein_copies = None):
    '''Calculate the mean square displacement (MSD) of particles in a molecular dynamics simulation trajectory using the Python `MDAnalysis <http://code.google.com/p/mdanalysis/>`_ package [Michaud-Agrawal2011]_.

    Parameters
    ----------
    coordinate_file_path: str
        Path to the coordinate file to be used in loading the trajectory.
    trajectory_file_path: str or list of str
        Path to the trajectory file to be used or ordered list of trajectory file paths.
    window_size_frames_list: list
        List of window sizes measured in frames. Time values are not used as timesteps and simulation output frequencies can vary.
    dict_particle_selection_strings: dict
        Dictionary of the MDAnalysis selection strings for each particle set for which MSD values will be calculated separately. Format: {'particle_identifier':'MDAnalysis selection string'}. If a given selection contains more than one particle then the centroid of the particles is used, and if more than one MDAnalysis residue object is involved, then the centroid of the selection within each residue is calculated separately.
    contiguous_protein_selection: str
        When parsing protein diffusion it may be undesirable to split by residue (i.e., amino acid). You may provide an MDAnalysis selection string for this parameter containing a single type of protein (and no other protein types or other molecules / atoms). This selection string may encompass multiple copies of the same protein which should be specified with the parameter `num_protein_copies`
    num_protein_copies: int
        The number of protein copies if contiguous_protein_selection is specified. This will allow for the protein coordinates to be split and the individual protein centroids will be used for diffusion analysis.

    Returns
    -------
    dict_MSD_values: dict
        Dictionary of mean square displacement data. Contains three keys: MSD_value_dict (MSD values, in Angstrom ** 2), MSD_std_dict (standard deviation of MSD values), frame_skip_value_list (the frame window sizes)

    References
    ----------

    .. [Michaud-Agrawal2011] N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. J. Comput. Chem. 32 (2011), 2319–2327

    Examples
    --------
    Extract MSD values from an artificial GROMACS xtc file (results measured in A**2 based on frame window sizes) containing only three amino acid residues.

    >>> import diffusion_analysis
    >>> import numpy
    >>> import matplotlib
    >>> import matplotlib.pyplot as plt

    >>> window_size_list_frames = [1,2,4,6]
    >>> dict_particle_selection_strings = {'MET':'resname MET','ARG':'resname ARG','CYS':'resname CYS'}
    >>> dict_MSD_values = diffusion_analysis.mean_square_displacement_by_species('./test_data/dummy.gro','./test_data/diffusion_testing.xtc',window_size_list_frames,dict_particle_selection_strings)

    Plot the results:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> for residue_name in dict_particle_selection_strings.keys():
    ...     p = ax.errorbar(window_size_list_frames,dict_MSD_values['MSD_value_dict'][residue_name],yerr=dict_MSD_values['MSD_std_dict'][residue_name],label=residue_name,fmt='o')
    >>> a = ax.set_xlabel('Window size (frames)')
    >>> a = ax.set_ylabel('MSD ($\AA^2$)')
    >>> a = ax.set_ylim(-10,600)
    >>> a = ax.legend(loc=2,numpoints=1)

    .. image:: example_MSD_extraction.png


'''
    import MDAnalysis

    universe_object = MDAnalysis.Universe(coordinate_file_path,trajectory_file_path)

    if not contiguous_protein_selection:
        MDA_residue_selection_dictionary = {}
        for particle_name, selection_string in dict_particle_selection_strings.items():
            MDA_selection = universe_object.select_atoms(selection_string)
            MDA_selection_residue_list = MDA_selection.residues #have to break it down by residues, otherwise would end up with centroid of all particles of a given name
            list_per_residue_selection_objects = [residue.atoms.select_atoms(selection_string) for residue in MDA_selection_residue_list] #the MDA selection objects PER residue
            MDA_residue_selection_dictionary[particle_name] = list_per_residue_selection_objects

        def centroid_array_production(current_MDA_selection_dictionary): #actually my modification of this for the github workflow won't work--it will find the centroid of ALL the particles with the same name
            '''Produce numpy arrays of centroids organized in a dictionary by particle identifier and based on assignment of particle selections to unique residue objects within MDAnalysis.'''
            dictionary_centroid_arrays = {}
            for particle_name,list_per_residue_selection_objects in current_MDA_selection_dictionary.items():
                list_per_residue_selection_centroids = [residue_selection.centroid() for residue_selection in list_per_residue_selection_objects]
                dictionary_centroid_arrays[particle_name] = numpy.array(list_per_residue_selection_centroids)
            return dictionary_centroid_arrays

    else: #dealing with proteins, where we don't want a breakdown by residue
        protein_selection = universe_object.select_atoms(contiguous_protein_selection)


    dict_MSD_values = {'MSD_value_dict':{},'MSD_std_dict':{},'frame_skip_value_list':[]} #for overall storage of MSD average / standard deviation values for this replicate

    for MSD_window_size_frames in window_size_frames_list:
        list_per_window_average_displacements = []
        counter = 0
        trajectory_striding_dictionary = {} #store values in a dict as you stride through trajectory
        for ts in universe_object.trajectory[::MSD_window_size_frames]:
            if counter == 0: #first parsed frame
                if not contiguous_protein_selection:
                    previous_frame_centroid_array_dictionary = centroid_array_production(MDA_residue_selection_dictionary)
                else:
                    previous_frame_centroid_array_dictionary = centroid_array_production_protein(protein_selection, num_protein_copies)
            else: #all subsequent frames
                if not contiguous_protein_selection:
                    current_frame_centroid_array_dictionary = centroid_array_production(MDA_residue_selection_dictionary)
                else:
                    current_frame_centroid_array_dictionary = centroid_array_production_protein(protein_selection, num_protein_copies)

                for particle_name in current_frame_centroid_array_dictionary.keys():
                    if not particle_name in trajectory_striding_dictionary.keys(): #create the appropriate entry if this particle types hasn't been parsed yet
                        trajectory_striding_dictionary[particle_name] = {'MSD_value_list_centroids':[]}
                    current_delta_array_centroids = previous_frame_centroid_array_dictionary[particle_name] - current_frame_centroid_array_dictionary[particle_name]
                    square_delta_array_centroids = numpy.square(current_delta_array_centroids)
                    sum_squares_delta_array_centroids = numpy.sum(square_delta_array_centroids,axis=1)
                    trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'].append(numpy.average(sum_squares_delta_array_centroids))

                #reset the value of the 'previous' array as you go along:
                previous_frame_centroid_array_dictionary = current_frame_centroid_array_dictionary
                #print 'frame:', ts.frame
            counter += 1
        for particle_name, MSD_data_subdictionary in trajectory_striding_dictionary.items():
            if not particle_name in dict_MSD_values['MSD_value_dict'].keys(): #initialize subdictionaries as needed
                dict_MSD_values['MSD_value_dict'][particle_name] = []
                dict_MSD_values['MSD_std_dict'][particle_name] = []
            dict_MSD_values['MSD_value_dict'][particle_name].append(numpy.average(numpy.array(trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'])))
            dict_MSD_values['MSD_std_dict'][particle_name].append(numpy.std(numpy.array(trajectory_striding_dictionary[particle_name]['MSD_value_list_centroids'])))
        dict_MSD_values['frame_skip_value_list'].append(MSD_window_size_frames)
    return dict_MSD_values