-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathinputs.py
386 lines (357 loc) · 16 KB
/
inputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# Copyright 2013-2015 James S Blachly, MD and The Ohio State University
#
# This file is part of Mucor.
#
# Mucor is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mucor is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mucor. If not, see <http://www.gnu.org/licenses/>.
# inputs.py
from __future__ import print_function
import HTSeq
import sys
import os
from variant import Variant
import numpy as np
def throwWarning(message, help = False):
print("*** WARNING: " + message + " ***")
return
class Parser(object):
'''Object to cover all parsing functions'''
def __init__(self):
self.row = ''
self.source = ''
self.var = None
self.fieldId = ''
self.header = ''
self.fn = ''
# Respective parse functions.
# Keys are identical to the allowed formats in mucor_config.py
# Values are the parse functions defined below
self.supported_formats = { "MiSeq":self.parse_MiSeq,
"IonTorrent":self.parse_IonTorrent,
"SomaticIndelDetector":self.parse_SomaticIndelDetector,
"Mutect":self.parse_MuTectVCF,
"muTector":self.parse_MuTectOUT,
"Samtools":self.parse_SamTools,
"VarScan":self.parse_VarScan,
"HaplotypeCaller":self.parse_HapCaller,
"FreeBayes":self.parse_FreeBayes,
"GenericGATK":self.parse_GenericGATK,
"INFOCol":self.parse_INFO_Column }
def parse(self, row, source):
self.source = source
self.row = row
chrom = row.chrom
ref = row.ref
alt = "/".join(sorted(row.alt))
pos = row.pos # already exists as GenomicPosition object
dp = 0
frac = 0.0
effect, fc = parse_EFC(row.info)
out = []
samples_dict = self.supported_formats[self.source](row.samples)
for sample, vals in samples_dict.items():
out.append(Variant(source='', sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
return out
def parse_MiSeq(self,samples):
''' MiSeq vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
vf = None
dp = None
if 'VF' in values.keys():
vf = float(values['VF'])
if 'DP' in values.keys():
dp = int(values['DP'])
if 'AD' in values.keys():
# prepare ref depth and alt depth in case VAF or DP was not defined
rd = int(values['AD'].split(',')[0])
ad = int(values['AD'].split(',')[1])
if not vf and 'AD' in values.keys():
# try to calculate vaf from AD column
vf = float(ad)/float(ad + rd)
if not dp and 'AD' in values.keys():
# try to calculate dp from AD column
if vf:
# if AD and VF are both defined, this will yield the same DP defined in the INFO column
dp = round(ad/vf)
else:
# the sum of ref and alt alleles may not equal total depth, if there are two mutations at the same location, or if a single base was mutated and not shown
dp = rd + ad
out[sample] = (dp, vf)
return out
def parse_IonTorrent(self, samples):
''' Ion Torrent vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
try:
ao = sum( [int(x) for x in values['AO'].split(',')] )
ro = int(values['RO'])
dp = int(values['DP'])
vf = float(ao)/float(dp)
except KeyError:
pass
out[sample] = (dp, vf)
return out
def parse_MuTectOUT(self):
''' MuTect '.out' parser function. Input: InputParser object. Output: Variant object '''
row = self.row
fieldId = self.fieldId
header = self.header
fn = self.fn
chrom = row[0]
ref = row[3]
alt = row[4]
effect = ""
fc = ""
vf = float(row[fieldId['tumor_f']])
dp = int(int(str(row[fieldId['t_ref_count']]).strip()) + int(str(row[fieldId['t_alt_count']]).strip()))
position = int(row[fieldId['position']])
var = Variant(source=os.path.basename(fn), sample='', pos=HTSeq.GenomicPosition(chrom, int(position)), ref=ref, alt=alt, frac=vf, dp=dp, eff=effect.strip(';'), fc=fc.strip(';'))
return var
def parse_MuTectVCF(self, samples):
''' MuTect vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
# remove empty sample columns
if sample == "none" and values['DP'] == '0' and values['BQ'] == '.':
continue
try:
dp = values['DP']
vf = values['FA']
except KeyError:
pass
out[sample] = (dp, vf)
return out
def parse_SomaticIndelDetector(self, samples):
''' DEPRECIATED AND UNTESTED '''
''' GATK SomaticIndelDetector vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
try:
dp = int(values['DP'])
ad = int(values['AD'].split(',')[1])
vf = float(ad)/float(dp)
except KeyError:
pass
out[sample] = (dp, vf)
return out
'''
# pre-htseq vcf reder parse function
row = self.row
fieldId = self.fieldId
header = self.header
fn = self.fn
chrom = row[0]
ref = row[3]
alt = row[4]
effect = self.eff
fc = self.fc
j = 0
# Below attempts to grab sample ID.
# assumes that sample ID is the final column in the self.header. always true?
# if not always true, adopt the parse_mutect solution here as well
tmpsampID = header[-1]
for i in row[fieldId['FORMAT']].split(':'):
if i == "AD":
ALT_count = row[fieldId[tmpsampID]].split(':')[j].split(',')[1]
elif i == "DP":
dp = row[fieldId[tmpsampID]].split(':')[j]
vf = float( float(ALT_count)/float(dp) )
j+=1
position = int(row[fieldId['POS']])
var = Variant(source=fn.split('/')[-1], pos=HTSeq.GenomicPosition(chrom, int(position)), ref=ref, alt=alt, frac=vf, dp=dp, eff=effect.strip(';'), fc=fc.strip(';'))
return var
'''
def parse_SamTools(self, samples):
''' samtools vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
if 'DP4' in values.keys():
dp4 = values['DP4'].split(',')
ro = int(dp4[0]) + int(dp4[1])
ao = int(dp4[2]) + int(dp4[3])
dp = ro + ao
vf = float(ao)/float(dp)
if not dp:
# if DP4 is not present, check for depth field
if 'DP' in values.keys():
dp = int(values['DP'])
out[sample] = (dp, vf)
return out
def parse_VarScan(self,samples):
''' varscan vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
try:
dp = int(values['DP'])
vf = float(values['FREQ'].strip('%'))/100.0
except KeyError:
pass
out[sample] = (dp, vf)
return out
def parse_HapCaller(self, samples):
''' GATK haplotype caller vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
vf = None
dp = None
try:
dp = int(values['DP'])
ad = str(values['AD'])
ref_count = int(ad.split(',')[0])
alt_count = int(ad.split(',')[1])
vf = float( float(alt_count)/(float(ref_count) + float(alt_count)) )
except KeyError:
pass
except ZeroDivisionError:
if dp == 0 and alt_count == 0 and ref_count == 0:
# This was called, but is not covered
vf = 0
out[sample] = (dp, vf)
return out
def parse_FreeBayes(self, samples):
''' freebayes vcf parser function. Input: InputParser object. Output: Variant object '''
out = {} # list of variant objects; 1 per sample
for sample, values in samples.items():
dp = None
vf = None
try:
dp = int(values['DP'])
ro = int(values['RO'])
ao = sum([int(x) for x in values['AO'].split(',')])
vf = float( float(ao)/float(ao + ro) )
except KeyError:
pass
out[sample] = (dp, vf)
return out
def parse_GenericGATK(self, samples):
'''
Generic GATK parser function. This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf files, but may apply to more filetypes
Input: InputParser object. Output: Variant object
'''
out = {}
for sample, values in samples.items():
dp = None
vf = None
if 'DP' in values.keys():
#DP is defined
dp = values['DP']
if 'VF' in values.keys():
#VF is defined
vf = values['VF']
if not dp or not vf:
try:
# try to determine depth and VAF from 'AD' field, if present
ro = int(values['AD'].split(',')[0])
ao = sum([int(x) for x in values['AD'].split(',')[1:]])
if not dp and (ro or ao): # is is possible that ro and ao are both 0, leading to division error when calculating vf
dp = ro + ao
vf = float(float(ao)/float(dp)) # one VF for all possible alternate alleles. Nothing unusual, unless the mutation has multiple alt alleles in 1 vcf line
except KeyError:
if not dp:
# there may not have been a mutant here, or it was unable to be identified.
# printing this error can result in overwhelming amounts of warnings
pass
#throwWarning("Could not determine depth for {0} using GenericGATK parser".format(sample))
if not vf:
# there may not have been a mutant here, or it was unable to be identified.
# printing this error can result in overwhelming amounts of warnings
pass
#throwWarning("Could not determine variant allele frequency for {0} using GenericGATK parser".format(sample))
pass
out[sample] = (dp, vf)
return out
def parse_INFO_Column(self, samples):
'''
Generic VCF parser which extracts allele frequency and depth from the INFO field, rather than the SAMPLE field associated with a specific sample. May be useful for cases in which aggregate VAF and DP are more interesting than per-sample metrics, or when aggregate values are available and per-sample values are not.
Input:
'''
out = {}
info = self.row.info
for sample, values in samples.items():
dp = None
vf = None
for key in ['VAF', 'VF', 'AF']: # expandable to accomadate other abbreviations
if key in info:
vf = np.sum(info[key]) # sum allele frequencies in the event that the position is multi-allelic
for key in ['DP', 'ADP']: # expandable to accomadate other abbreviations
if key in info:
dp = info[key]
out[sample] = (dp, vf)
return out
def parse_MAF(self):
''' DEPRECIATED AND UNTESTED'''
''' maf filetype parser function. Input: InputParser object. Output: Variant object '''
row = self.row
fieldId = self.fieldId
header = self.header
fn = self.fn
position = int(str(row[fieldId['Start_position']]).split('.')[0]) # case sensitive. what if, 'Start_Position' instead? case-insensitive hash lookup, or make everything lowercase befor making comparisons?
dp = int(str(row[fieldId['TTotCov']]).split('.')[0])
vf = float( float(row[fieldId['TVarCov']])/float(dp) )
chrom = str(row[fieldId['Chromosome']])
ref = str(row[fieldId['Reference_Allele']])
alt = str(row[fieldId['Tumor_Seq_Allele2']])
effect = self.eff
fc = self.fc
if ref == "-":
ref = ""
if alt == "-":
alt = ""
var = Variant(source=os.path.basename(fn), pos=HTSeq.GenomicPosition(chrom, int(position)), ref=ref, alt=alt, frac=vf, dp=dp, eff=effect.strip(';'), fc=fc.strip(';'))
return var
def castList(list_or_string):
'''
Input: a string or a list
Output: list
'''
if isinstance(list_or_string, list):
return list_or_string
else:
return [list_or_string]
def parse_EFC(INFO):
# attempt to extract 'effect' and 'functional consequence' from the VCF line
effect = "" # e.g. E123K
fc = "" # e.g. SYNONYMOUS_CODING
# check for snpeff annotation first
if 'EFF' in INFO:
castList(INFO['EFF'])
fc = ";".join([x for x in set([ x.split('(')[0] for x in castList(INFO['EFF']) ])])
effect = ";".join([x for x in set([ x.split('|')[3] for x in castList(INFO['EFF']) if x.split('|')[3] ])])
# check for newer type of snpeff notation
elif 'ANN' in INFO:
castList(INFO['ANN'])
fc = ";".join([x for x in set([ x.split('|')[1] for x in castList(INFO['ANN']) ])])
effect = ";".join([x for x in set([ x.split('|')[9] for x in castList(INFO['ANN']) ])])
# if the MiSeq software reported functional consequence and effect and the file is not snpEff anotated, the MiSeq annotations will be used instead
elif 'FC' in INFO:
fc = ";".join([x for x in set( [y.split('_')[0] for y in castList(INFO['FC'])] )])
try:
effect = ";".join([x for x in set( [y.split('_')[1] for y in castList(INFO['FC'])] )])
except IndexError:
# this mutation has no consequence
# i.e. Silent, Noncoding, etc.
pass
if 'EXON' in INFO:
fc += ";EXON"
return effect, fc