forked from ndfriedman/myMskccUtils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine_impact_data_util.py
231 lines (164 loc) · 10.4 KB
/
combine_impact_data_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#written by Noah Friedman
#a script intended to be a pipeline for impact data agglomeration and other stuff
import sys
import argparse
import os
import pandas as pd
import numpy as np
import data_compacting_and_cleaning_util
#TODO: make these functions parameter based instead of argument based
#function to combine case based annotations (patient drugs etc)
def combine_case_based_annotations(clinicalInfoTablePath=None, survivalDataPath=None, drugDataPath=None,
        caseDfIdCol='PATIENT_ID', writeCombinedDf=False, outputFilename='combinedCaseBasedAnnotations.tsv', outputDirPath='/ifs/work/taylorlab/friedman/myAdjustedDataFiles'):
    """Combine case based (per patient) annotation tables into a single dataframe.

    Every path that is not None is read with pd.read_table and the resulting
    frames are handed to data_compacting_and_cleaning_util.amalgamate_dfs_same_col
    together with caseDfIdCol.

    Args:
        clinicalInfoTablePath: path to the clinical info table; its
            'Patient_Identifier' column is renamed to caseDfIdCol.
        survivalDataPath: path to the survival data table.
        drugDataPath: path to the drug treatment table.
        caseDfIdCol: name of the id column passed to the amalgamation helper.
        writeCombinedDf: when True the combined frame is also written as a
            tab separated file to outputDirPath/outputFilename.
        outputFilename: file name used when writeCombinedDf is True.
        outputDirPath: directory used when writeCombinedDf is True.

    Returns:
        Whatever amalgamate_dfs_same_col returns in mode='return'
        (presumably the merged dataframe -- helper is not defined in this file).
    """
    caseBasedDfsToAmalgamate = []  # a list of case based dfs to amalgamate (clinical dfs etc)
    if clinicalInfoTablePath is not None:
        clinicalDf = pd.read_table(clinicalInfoTablePath)
        # the clinical table names its id column differently; align it with the id column
        clinicalDf = clinicalDf.rename(columns={'Patient_Identifier': caseDfIdCol})
        caseBasedDfsToAmalgamate.append(clinicalDf)
    if survivalDataPath is not None:
        caseBasedDfsToAmalgamate.append(pd.read_table(survivalDataPath))
    if drugDataPath is not None:
        caseBasedDfsToAmalgamate.append(pd.read_table(drugDataPath))

    #ALERT do renames if needed for casedf id col
    mergedCaseBasedDf = data_compacting_and_cleaning_util.amalgamate_dfs_same_col(caseBasedDfsToAmalgamate, caseDfIdCol, 'combinedCaseBasedAnnotations.tsv', mode='return')

    if writeCombinedDf:
        writePath = os.path.join(outputDirPath, outputFilename)
        # single-argument print call works on Python 2 and 3; the double space
        # reproduces the output of the former `print a, b` statement
        print('writing data to  ' + writePath)
        # write the frame that was just merged (the previous code referenced an
        # undefined name here and raised NameError)
        mergedCaseBasedDf.to_csv(writePath, index=False, sep='\t')
    return mergedCaseBasedDf
#util function to combine sample specific annotations
def combine_sample_based_annotations(signatureTablePath=None, sampleInfoPath=None, cnaSummaryInfoPath=None,
        sampleDfIdCol='Tumor_Sample_Barcode'):
    """Combine sample based (per tumor sample) annotation tables into a single dataframe.

    Every path that is not None is read with pd.read_table and the resulting
    frames are handed to data_compacting_and_cleaning_util.amalgamate_dfs_same_col
    together with sampleDfIdCol.

    Args:
        signatureTablePath: path to the signature table.
        sampleInfoPath: path to the sample info table; its '#Sample Identifier'
            column is renamed to sampleDfIdCol.
        cnaSummaryInfoPath: path to the copy number summary table.
        sampleDfIdCol: name of the id column passed to the amalgamation helper.

    Returns:
        Whatever amalgamate_dfs_same_col returns in mode='return'
        (presumably the merged dataframe -- helper is not defined in this file).
    """
    sampleBasedDfsToAmalgamate = []  # a list of sample based dfs to amalgamate (signatures etc)
    if signatureTablePath is not None:
        sampleBasedDfsToAmalgamate.append(pd.read_table(signatureTablePath))
    if sampleInfoPath is not None:
        sampleDf = pd.read_table(sampleInfoPath)
        # rename to the requested id column (not a hard-coded name) so the frame
        # still carries the id column when a caller overrides sampleDfIdCol
        sampleDf = sampleDf.rename(columns={'#Sample Identifier': sampleDfIdCol})
        sampleBasedDfsToAmalgamate.append(sampleDf)
    if cnaSummaryInfoPath is not None:
        sampleBasedDfsToAmalgamate.append(pd.read_table(cnaSummaryInfoPath))

    mergedSampleBasedDf = data_compacting_and_cleaning_util.amalgamate_dfs_same_col(sampleBasedDfsToAmalgamate, sampleDfIdCol, 'combinedSampleBasedAnnotations.tsv', mode='return')
    return mergedSampleBasedDf
def get_specific_cancer_type_case_ids(cancerTypes, pathToCancerTypeLists='/ifs/work/taylorlab/friedman/clinicalData/msk-impact/msk-impact/case_lists'):
    """Return the union of the case ids listed in the given case list files.

    Args:
        cancerTypes: iterable of file names located inside pathToCancerTypeLists.
        pathToCancerTypeLists: directory that holds the case list files.

    Returns:
        A set with every id found on the fifth line of each file. That line is
        split on tabs and its first field is skipped (presumably a label).

    Raises:
        IndexError: if a file has fewer than five lines.
    """
    idLineIndex = 4  # zero based index of the line that carries the tab separated ids
    idSet = set()
    for cancerType in cancerTypes:
        path = os.path.join(pathToCancerTypeLists, cancerType)
        # the with block closes the handle, no explicit close() is required
        with open(path) as f:
            lines = f.readlines()
        if len(lines) <= idLineIndex:
            raise IndexError('case list file %s has %d lines, expected the ids on line %d' % (path, len(lines), idLineIndex + 1))
        # drop the line terminator (including a stray carriage return) before splitting
        caseIds = lines[idLineIndex].rstrip('\r\n').split('\t')[1:]
        idSet.update(caseIds)
    return idSet
def combine_gene_based_dfs(
        somaticMutationsPath=None, signaturesDFPath=None, cnaDfPath=None, makeGenesCols=False,
        writeCombinedDf=False, outputFilename='geneBasedAnnotations.tsv', outputDirPath='/ifs/work/taylorlab/friedman/myAdjustedDataFiles'):
    """Merge per gene tables (CNA, somatic mutations, signatures) on 'Tumor_Sample_Barcode'.

    Frames are merged in the order CNA, somatic mutations, signatures using
    pd.merge with its default join type. Columns of the CNA and mutation frames
    receive a '_CNA' / '_MUT' suffix so the same gene name does not collide.

    Args:
        somaticMutationsPath: path handed to
            data_compacting_and_cleaning_util.parse_mut_data_into_gene_matrix.
        signaturesDFPath: path to a signature table read with pd.read_table.
        cnaDfPath: path handed to
            data_compacting_and_cleaning_util.parse_data_format_cases_on_line_one.
        makeGenesCols: currently unused.
        writeCombinedDf: when True the merged frame is also written as a tab
            separated file to outputDirPath/outputFilename.
        outputFilename: file name used when writeCombinedDf is True.
        outputDirPath: directory used when writeCombinedDf is True.

    Returns:
        The merged dataframe.

    Raises:
        ValueError: if no input path was supplied (there is nothing to merge).
    """
    #util function to rename gene columns with a suffix so there isnt a conflict for gene amplifications/variants etc
    def rename_cols_with_suffix(df, colSuffix):
        # the join column must keep its name, every other column gets the suffix
        renameDict = {col: col + colSuffix for col in df.columns if col != 'Tumor_Sample_Barcode'}
        return df.rename(columns=renameDict)

    geneBasedDfsToAmalgamate = []  #a list of dfs with information on a per gene basis
    if cnaDfPath is not None:
        dfCNA = data_compacting_and_cleaning_util.parse_data_format_cases_on_line_one(cnaDfPath)
        geneBasedDfsToAmalgamate.append(rename_cols_with_suffix(dfCNA, '_CNA'))
    if somaticMutationsPath is not None:  #TODO qualify this information based on how we are going to treat mutations
        geneMatrix = data_compacting_and_cleaning_util.parse_mut_data_into_gene_matrix(somaticMutationsPath, write=False)
        geneBasedDfsToAmalgamate.append(rename_cols_with_suffix(geneMatrix, '_MUT'))
    if signaturesDFPath is not None:
        geneBasedDfsToAmalgamate.append(pd.read_table(signaturesDFPath))

    if not geneBasedDfsToAmalgamate:
        raise ValueError('combine_gene_based_dfs needs at least one of somaticMutationsPath, signaturesDFPath or cnaDfPath')

    # left fold over the frames; a plain loop avoids relying on the Python 2
    # only builtin reduce
    dfFinal = geneBasedDfsToAmalgamate[0]
    for df in geneBasedDfsToAmalgamate[1:]:
        dfFinal = pd.merge(dfFinal, df, on='Tumor_Sample_Barcode')

    if writeCombinedDf:
        writePath = os.path.join(outputDirPath, outputFilename)
        # single-argument print call works on Python 2 and 3; the double space
        # reproduces the output of the former `print a, b` statement
        print('writing data to  ' + writePath)
        dfFinal.to_csv(writePath, index=False, sep='\t')
    return dfFinal
# a utility function for combining multiple dfs with information about variants
#creates a join col based on chromosome, pos alt
def combine_variant_based_dfs(mafAnnoDf=None, triuncDf=None,
        writeCombinedDf=False, outputFilename='mergedAnnotatedMaf.maf', outputDirPath='/ifs/work/taylorlab/friedman/myAdjustedDataFiles'):
    """Combine variant level tables by joining them on a per variant key.

    Each input table gets a 'joinCol' built from Chromosome, Start_Position,
    End_Position, Tumor_Seq_Allele1 and Tumor_Seq_Allele2; rows with a duplicate
    key are dropped (first occurrence kept) and the tables are inner merged on
    the columns they share.

    Args:
        mafAnnoDf: path to an annotated maf (read with pd.read_table); despite
            the name this is a path, not a dataframe.
        triuncDf: path to a maf carrying a 'Ref_Tri' column; only 'joinCol' and
            'Ref_Tri' of this table take part in the merge.
        writeCombinedDf: when True the merged frame is also written as a tab
            separated file to outputDirPath/outputFilename.
        outputFilename: file name used when writeCombinedDf is True.
        outputDirPath: directory used when writeCombinedDf is True.

    Returns:
        The merged dataframe. When only one of the two paths is supplied the
        single keyed and de-duplicated table is returned unchanged.

    Raises:
        ValueError: if neither path was supplied.
    """
    #create a unique identifier for joining the dfs
    def create_chr_pos_join_key(row):
        # NOTE(review): the key previously repeated Tumor_Seq_Allele2 on both
        # sides of the '/', which collapsed variants that differ only in allele 1.
        # Allele1 is used because the triunc table explicitly selects it -- confirm.
        return (str(row['Chromosome']) + ':' + str(row['Start_Position']) + '->' + str(row['End_Position'])
                + ';' + str(row['Tumor_Seq_Allele1']) + '/' + str(row['Tumor_Seq_Allele2']))

    dfsToAmalgamate = []
    if mafAnnoDf is not None:
        mafAnnoData = pd.read_table(mafAnnoDf)
        mafAnnoData['joinCol'] = mafAnnoData.apply(create_chr_pos_join_key, axis=1)
        mafAnnoData = mafAnnoData.drop_duplicates(subset=['joinCol'])
        dfsToAmalgamate.append(mafAnnoData)
    if triuncDf is not None:
        triuncData = pd.read_table(triuncDf)
        # copy so the new column is added to an independent frame, not to a slice
        triuncData = triuncData[['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Ref_Tri']].copy()
        triuncData['joinCol'] = triuncData.apply(create_chr_pos_join_key, axis=1)
        # only the key and the trinucleotide context are carried into the merge
        triuncData = triuncData[['joinCol', 'Ref_Tri']]
        triuncData = triuncData.drop_duplicates(subset=['joinCol'])
        dfsToAmalgamate.append(triuncData)

    if not dfsToAmalgamate:
        raise ValueError('combine_variant_based_dfs needs at least one of mafAnnoDf or triuncDf')

    # no `on` is given, so pandas joins on every column the frames have in common
    mergedDf = dfsToAmalgamate[0]
    for df in dfsToAmalgamate[1:]:
        mergedDf = pd.merge(mergedDf, df, how='inner')

    if writeCombinedDf:
        writePath = os.path.join(outputDirPath, outputFilename)
        # single-argument print call works on Python 2 and 3; the double space
        # reproduces the output of the former `print a, b` statement
        print('writing data to  ' + writePath)
        mergedDf.to_csv(writePath, index=False, sep='\t')
    return mergedDf
#a function for merging dfs of different sizes (ie a df of tumor sample barcodes and a df of clinical ids)
def merge_dfs_of_different_sizes(tsBarcodeDf, clinicalIdDf,
        writeCombinedDf=False, outputFilename='mergedAnnotations.tsv', outputDirPath='/ifs/work/taylorlab/friedman/myAdjustedDataFiles'):
    """Merge a per sample dataframe with a per patient dataframe.

    A 'PATIENT_ID' column is derived from the first nine characters of
    'Tumor_Sample_Barcode' and the two frames are merged on the columns they
    have in common (pandas default).

    Args:
        tsBarcodeDf: dataframe with a 'Tumor_Sample_Barcode' column. It is not
            modified; the derived column is added to a copy.
        clinicalIdDf: dataframe keyed by patient (expected to share the
            'PATIENT_ID' column).
        writeCombinedDf: when True the merged frame is also written as a tab
            separated file to outputDirPath/outputFilename.
        outputFilename: file name used when writeCombinedDf is True.
        outputDirPath: directory used when writeCombinedDf is True.

    Returns:
        The merged dataframe.
    """
    # .str slicing leaves missing barcodes as NaN instead of raising, and assign
    # returns a new frame so the caller's dataframe keeps its original columns
    patientIds = tsBarcodeDf['Tumor_Sample_Barcode'].str[:9]
    barcodeDfWithPatientId = tsBarcodeDf.assign(PATIENT_ID=patientIds)
    combinedDf = clinicalIdDf.merge(barcodeDfWithPatientId)

    if writeCombinedDf:
        writePath = os.path.join(outputDirPath, outputFilename)
        # single-argument print call works on Python 2 and 3; the double space
        # reproduces the output of the former `print a, b` statement
        print('writing data to  ' + writePath)
        combinedDf.to_csv(writePath, index=False, sep='\t')
    return combinedDf
#simple merge and write wrapper for merges that require no manipulation of constituent dfs
def simple_df_merge(df1, df2, mergeOn=None,
        writeCombinedDf=False, outputFilename='mergedAnnotations.tsv', outputDirPath='/ifs/work/taylorlab/friedman/myAdjustedDataFiles'):
    """Merge two dataframes and optionally write the result.

    Args:
        df1: left dataframe.
        df2: right dataframe.
        mergeOn: column name(s) forwarded to DataFrame.merge as `on`; None lets
            pandas join on the columns both frames share.
        writeCombinedDf: when True the merged frame is also written as a tab
            separated file to outputDirPath/outputFilename.
        outputFilename: file name used when writeCombinedDf is True.
        outputDirPath: directory used when writeCombinedDf is True.

    Returns:
        The merged dataframe.
    """
    mergedDf = df1.merge(df2, on=mergeOn)
    if writeCombinedDf:
        writePath = os.path.join(outputDirPath, outputFilename)
        # single-argument print call works on Python 2 and 3; the double space
        # reproduces the output of the former `print a, b` statement
        print('writing data to  ' + writePath)
        mergedDf.to_csv(writePath, index=False, sep='\t')
    return mergedDf
#returns a list of all the genes in the impact panel
def enumerate_impact_panel_genes(panelFile='/ifs/work/taylorlab/friedman/msk-impact/msk-impact/gene_panels/impact468_gene_panel.txt'):
    """Return the list of genes stored in a gene panel file.

    Args:
        panelFile: path to the panel file. The fourth line is split on tabs and
            its first field is skipped (presumably a label).

    Returns:
        A list with the remaining tab separated fields of the fourth line.

    Raises:
        IndexError: if the file has fewer than four lines.
    """
    geneLineIndex = 3  # zero based index of the line that carries the tab separated genes
    with open(panelFile) as f:
        lines = f.readlines()
    if len(lines) <= geneLineIndex:
        raise IndexError('panel file %s has %d lines, expected the genes on line %d' % (panelFile, len(lines), geneLineIndex + 1))
    # drop the line terminator (including a stray carriage return) before splitting
    return lines[geneLineIndex].rstrip('\r\n').split('\t')[1:]
def main():
    """Parse the command line and run the requested combination step.

    When --amalgamatedFilePath is given that file is loaded; otherwise the gene
    level or variant level combination is run depending on
    --geneVsVariantLevelAnnotationMode.

    NOTE(review): the original indentation of this function was not available;
    the mode dispatch is placed in the "no amalgamated file" branch because the
    --amalgamatedFilePath help text says a file is created only when none is
    supplied -- confirm.
    NOTE(review): the combine functions are called with their default
    writeCombinedDf=False, so nothing is written to disk by this entry point.

    Returns:
        The loaded or combined dataframe, or None when the selected mode
        produces no result.
    """
    parser = argparse.ArgumentParser(description='Arg parser for this script')
    parser.add_argument('--amalgamatedFilePath', help='a user specified path to an amalgamated file. if none, as default, create an amagamated file', default=None)
    parser.add_argument('--signaturesTable', help='Table of signature data', default='/ifs/work/taylorlab/friedman/signatureInvestigation/inputFiles/impactSignaturesAlexGenerated.txt')
    parser.add_argument('--facetsData', help='path to maf annotated with facets data', default='/ifs/work/taylorlab/friedman/signatureInvestigation/inputFiles/data_mutations_extended.mafAnno_fast.txt')
    parser.add_argument('--clinicalInfoTable', help='Table of clinical info table', default='/ifs/work/taylorlab/friedman/signatureInvestigation/inputFiles/clinicalPatientDataReheadered.txt')
    parser.add_argument('--survivalData', help='path to survival data', default='/ifs/work/taylorlab/friedman/signatureInvestigation/inputFiles/survivalData.txt')
    parser.add_argument('--drugData', help='path to data about drug treatment data', default='/ifs/work/taylorlab/friedman/clinicalData/msk-impact/msk-impact/data_timeline.txt')
    #/ifs/work/taylorlab/friedman/myAdjustedDataFiles/impactVarsWithTriunc.maf #VERSION OF MAF WITH REF TRI INFO
    parser.add_argument('--impactSomaticMutations', help='the path to the data containing impact somatic mutations', default='/ifs/work/taylorlab/friedman/clinicalData/msk-impact/msk-impact/data_mutations_extended.txt')
    parser.add_argument('--impactGermlineMutations', help='the path to the data containing impact germline mutations', default=None)
    parser.add_argument('--clonalityData', help='path to clonality info about the sample', default='/ifs/work/taylorlab/friedman/signatureInvestigation/clonalityInfo.tsv')
    parser.add_argument('--cnaData', help='', default='/ifs/work/taylorlab/friedman/clinicalData/msk-impact/msk-impact/data_CNA.txt')

    #arg parser modes
    parser.add_argument('--geneVsVariantLevelAnnotationMode', help='a mode dictating whether the analysis of certain features such as facets or somatic mutations should be performed on the gene or variant level. There are three valid options: geneOnly, variantOnly, or both', default='geneOnly')

    args = parser.parse_args()

    amalgamatedDf = None
    if args.amalgamatedFilePath is not None:
        amalgamatedDf = pd.read_table(args.amalgamatedFilePath)
    else:
        #TODO: merge case based and sample based annotations here as well
        #(combine_case_based_annotations / combine_sample_based_annotations)
        if args.geneVsVariantLevelAnnotationMode == 'geneOnly':
            amalgamatedDf = combine_gene_based_dfs(args.impactSomaticMutations)
        elif args.geneVsVariantLevelAnnotationMode == 'variantOnly':
            # the whole args namespace used to be passed as the maf path, which
            # can never be read as a file.
            # NOTE(review): --facetsData is used as the annotated maf and
            # --impactSomaticMutations as the Ref_Tri maf (see the commented
            # path next to that argument) -- confirm this is the intended pairing.
            amalgamatedDf = combine_variant_based_dfs(mafAnnoDf=args.facetsData, triuncDf=args.impactSomaticMutations)
        elif args.geneVsVariantLevelAnnotationMode == 'both':
            print('both variant and gene mode not implemented')
        else:
            print('invalid geneVsVariant annotation option specifed. valid options are: both, variantOnly and geneOnly')
    return amalgamatedDf


if __name__ == '__main__':
    main()