-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistical_raptor.py
384 lines (318 loc) · 13.9 KB
/
statistical_raptor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# General modules
import pickle
import sys
import os
import time
from time import gmtime, strftime
import logging
import argparse
# Graph modules
import plotly
from plotly.graph_objs import Scatter, Layout
# Bio.PDB
from Bio.PDB import *
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from numpy import *
# Own modules, scripts
from phobius_tester import *
from SP_calculator_DB import calculate_distance
def select_and_save_distances_query(beg_helix1,end_helix1,
beg_helix2,end_helix2,
chain1,chain2, transdom_x_distances):
"""
Iterates through all the residues from two domains and saves the distance
calculated in the calculate_distance function.
"""
if chain1 is chain2:
if beg_helix1 is beg_helix2:
beg_helix2 += 5
for res1 in range(beg_helix1, end_helix1 + 1):
try:
res_name1 = chain1[res1].get_resname()
except KeyError as e:
continue
for res2 in range(beg_helix2, end_helix2 + 1):
if res1 == res2:
continue
try:
res_name2 = chain2[res2].get_resname()
except KeyError as e:
continue
distance = calculate_distance(res1, res2, chain1, chain2)
if distance %1 >= 0.5:
distance = round(distance)
else:
distance = distance - (distance % 1)
distance = round(distance) + 0.5
save_results_query(res1, res_name1, res_name2,
distance, transdom_x_distances)
def save_results_query(res1, res_name1, res_name2, distance, dict_name_dist):
"""
Saves the results of the distances of two residues in a given position.
"""
# Check if the entry of the 2 aa has been generated reversely!
dict_name_dist.setdefault(res1, {})
if (res_name2, res_name1) in dict_name_dist[res1]:
dict_name_dist[res1][(res_name2, res_name1)].setdefault(distance, 0)
dict_name_dist[res1][(res_name2, res_name1)][distance] += 1
return
# Save results into a dict-list
dict_name_dist.setdefault(res1, {})
dict_name_dist[res1].setdefault((res_name1, res_name2), {})
dict_name_dist[res1][(res_name1, res_name2)].setdefault(distance, 0)
dict_name_dist[res1][(res_name1, res_name2)][distance] += 1
return
def get_seq_PDB_CIF(filename):
"""
From a PDB created with Modeller or other, iterates trough all atoms and generates
a fasta sequence with all the information.
"""
pdb = open (filename, "r")
seq = ">" + filename[0:4] + ":A|something|other|another\n"
chain = "A"
correspondence = { "ALA":"A", "CYS":"C", "ASP":"D", "GLU":"E",
"PHE":"F", "GLY":"G", "HIS":"H", "ILE":"I", "LYS":"K", "LEU":"L",
"MET":"M", "ASN":"N", "PRO":"P", "GLN":"Q", "ARG":"R", "SER":"S",
"THR":"T", "VAL":"V", "TRP":"W", "TYR":"Y"}
for line in pdb:
if line.startswith("ATOM"):
field_list = line.split()
if '.pdb' in filename:
if field_list[2] == "CA":
if field_list[3] not in correspondence:
seq += str('X')
else:
seq += str(correspondence[field_list[3]])
if '.cif' in filename:
if field_list[3] == "CA":
if field_list[5] not in correspondence:
seq += str('X')
else:
seq += str(correspondence[field_list[5]])
seq += '\n>\n'
return seq
def distance_query_generator(filename):
"""
Creates the following dicts with the values of distances:
- query_intra_distances: values from the distances same chain same dom.
- query_inter_distances: values between different chains diff domains.
"""
# Creation of variables to work with new dicts
trans_dom_query = {}
query_inter_distances = {}
query_intra_distances = {}
file_id = (filename.split('/')[-1])[:-3]
if '.pdb' in filename:
try:
trans_dom_query = phobius_runner(filename[:-3] + 'fasta' )
except:
sys.stderr('***It is recomendable add the fasta seq as \
a separated file in the current directory ***\n ')
fasta_fil = open('fasta_to_generate.fasta', 'w')
seq_fasta = get_seq_PDB_CIF(filename)
fasta_fil.write(seq_fasta)
fasta_fil.close()
trans_dom_query = phobius_runner('fasta_to_generate.fasta')
entity = PDBParser()
elif '.cif' in filename:
try:
trans_dom_query = phobius_runner(filename[:-3] + 'fasta' )
except:
raise ValueError( 'To use .cif files introduce a fasta with the following name: '
+ filename[:-3] + 'fasta')
entity = MMCIFParser()
structure = entity.get_structure('X', filename)
model = structure[0]
for key, value in trans_dom_query.items():
logging.info('Chain processed: {}\n Transdom: {}'.format(key, value))
if '.pdb' in filename:
for chain in model:
chain1 = chain
else:
chain1 = model[key]
for val in value:
beg_helix1, end_helix1 = int(val[0]), int(val[1])
# ################INTRA DISTANCES ####################################
select_and_save_distances_query(beg_helix1,end_helix1,
beg_helix1,end_helix1,
chain1,chain1, query_intra_distances)
# ################INTER DISTANCES SAME CHAIN ##############################
for val2 in value:
if val == val2:
continue
beg_helix2, end_helix2 = int(val2[0]), int(val2[1])
select_and_save_distances_query(beg_helix1,end_helix1,
beg_helix2,end_helix2,
chain1,chain1, query_inter_distances)
# ################INTER DISTANCES BTWN CHAIN ########################
for key, value in trans_dom_query.items():
if '.pdb' in filename:
for chain in model:
chain2 = chain
else:
chain2 = model[key]
if chain1 == chain2:
continue
for val3 in value:
beg_helix3, end_helix3 = int(val3[0]), int(val3[1])
select_and_save_distances_query(beg_helix1,end_helix1,
beg_helix3,end_helix3,
chain1,chain2, query_inter_distances)
return (query_intra_distances,query_inter_distances)
def PMF_pair_calc(res_pair, distance, query_dict, SP_distances_2res, SP_freq_aa, SP_freq_dist):
"""
Calculates all the PMF_pair from a given pair of residues.
In order to do that, calculates:
- acumulative_distance_2res: frequencies presented at the specific distance or less
between two residues.
- acumulative_distance_gen: general frequency of the given distance or less in all the datasets.
"""
PMF_pair = 0
for dist_query, ocurrences in distance.items():
acumulative_distance_2res = 0
acumulative_distance_gen = 0
if res_pair not in SP_distances_2res.keys():
res_pair = (res_pair[1],res_pair[0])
if res_pair not in SP_freq_dist.keys():
res_pair = (res_pair[1],res_pair[0])
acumulative_distance_2res += sum( freq for dist,freq in SP_distances_2res[res_pair].items()
if dist <= dist_query and dist != 999 )
acumulative_distance_gen += sum( freq for dist,freq in SP_freq_dist.items()
if dist <= dist_query )
PMF_pair_dist = - log(acumulative_distance_2res/
(SP_freq_aa[res_pair[0]]*SP_freq_aa[res_pair[1]]*acumulative_distance_gen)
)*ocurrences
PMF_pair += PMF_pair_dist
return PMF_pair
def apply_window_size(PMF,window):
"""
Apply an specific windows size selected by the user for showing in the graph the results.
Calculates the median of the window and assigns at each position of interest the value.
"""
PMF_median = {}
list_keys = []
median_window = 0
for i in range(0,len(PMF)):
list_keys = sorted(PMF.keys())
if len(PMF) >= i+window:
PMF_median[list_keys[i]] = sum(PMF[list_keys[x]] for x in range(i,i+window)) / window
else:
too_much = i+window
while len(PMF) < too_much:
too_much -= 1
PMF_median[list_keys[i]] = sum(PMF[list_keys[x]] for x in range(i,too_much)) / (too_much-i)
return PMF_median
def make_plot(PMF_positions,file_id,output_path):
"""
Creates using plotly an offline graph in html with the displaying of the results.
"""
# Assigns None value to the positions no transmembrane for showing efficiently the results
for i in range(min(PMF_positions.keys()),max(PMF_positions.keys())):
if i in PMF_positions.keys():
continue
else:
PMF_positions[i] = None
positions = [key for key in sorted(PMF_positions.keys())]
pmf_values = [value for key,value in sorted(PMF_positions.items())]
color_list = [ 'rgb(22, 96, 167)', 'rgb(205, 12, 24)',
'rgb(0,128,128)', 'rgb(250,128,114)', 'rgb(154,205,50)'
'rgb(255,105,180)', 'rgb(0,191,255)', 'rgb(50,205,50)', 'rgb(165,42,42)']
# Create a trace
plotly.offline.plot({
"data": [
Scatter(
x=positions,
y=pmf_values,
line = dict(color = color_list[random.randint(0,len(color_list))])
)
],
"layout": Layout(
xaxis =dict(title = 'Position' ) ,
yaxis = dict(title = 'Potential Mean Force (kT)'),
title= '<b>Transmembrane statistic potentials: ' + file_id + '</b>'
)
}, filename = output_path + '_' + file_id + '.html')
def MAIN(filename,window,output_path):
"""
Main program with all the subprocesses. Calculates PMF intra and inter
positions. Creates the plot.
"""
# Recovering of the database with pickle module
tuple_dict = pickle.load( open( "SP_dataset.p", "rb" ))
# Prepares all the dataset for calculate PMF
SP_intra_distances = tuple_dict[0]
SP_inter_distances = tuple_dict[1]
SP_aa_freq = tuple_dict[2]
SP_dist_intra_freq = tuple_dict[3]
SP_dist_inter_freq = tuple_dict[4]
sys.stderr.write('\tCreating the distance query dict...\n')
# Obtain all the distance values
distances_query = distance_query_generator(filename)
query_distances_intra, query_distances_inter = distances_query[0] , distances_query[1]
PMF_positions = {}
sys.stderr.write('\tCalculating the PMF for each position...\n')
# Calculates all the PMF values for a given position
for pos in query_distances_intra.keys():
PMF_positions[pos] = sum(PMF_pair_calc(aa_pairs,distances,
query_distances_intra,
SP_intra_distances,
SP_aa_freq ,
SP_dist_intra_freq) for aa_pairs,distances in query_distances_intra[pos].items())
for pos in query_distances_inter.keys():
PMF_positions[pos] += sum(PMF_pair_calc(aa_pairs,distances,
query_distances_inter,
SP_inter_distances,
SP_aa_freq ,
SP_dist_inter_freq) for aa_pairs,distances in query_distances_inter[pos].items())
sys.stderr.write('\tPlotting the results...\n')
#Creates the graph in plotly
PMF_median= apply_window_size(PMF_positions,window)
file_id = filename.split('/')[-1]
make_plot(PMF_median,file_id,output_path)
def argparse_creator():
parser = argparse.ArgumentParser(description="""This program analizes the folding of transmembrane
domains in proten models according to knowledge-based potentials""")
parser.add_argument('-i','--input',
dest = "infile",
action = "store",
default = None,
help = "Input file or directory. Files must have only one chain")
parser.add_argument('-o','--output',
dest = "outfile",
action = "store",
default = "output.html",
help = "Path and output files.")
parser.add_argument('-w','--window',
dest = "window",
action = "store",
default = 10,
help = "Size of the window in the plot for statistic potentials")
options = parser.parse_args()
infile = options.infile
output = options.outfile
window = options.window
arg = (infile, output, window)
return arg
if __name__ == '__main__':
argv = argparse_creator()
if len(sys.argv) >= 1:
path = argv[0]
output_path = argv[1]
window = int(argv[2])
if os.path.isdir(path):
for filename in os.listdir(path):
if '.pdb' in path or '.cif' in filename:
sys.stderr.write('Working with {}\n'.format(filename))
file_pdb = os.path.join(path, filename)
MAIN(file_pdb,window,output_path)
elif os.path.isfile(path):
if '.pdb' in path or '.cif' in path:
sys.stderr.write('Working with {}\n'.format(path))
MAIN(path,window,output_path)
else:
raise ValueError("This type of file is not accepted")
else:
raise ValueError("We only work with directory files sir.")
else:
raise ValueError("TREX: Please specify input [1]")