import logging
import numpy as np
import scanpy as sc
import wandb
import time
import torch
import lightning as pl
import hydra

from models.byol_refactor import BYOL
from models.barlowtwins_refactor import BarlowTwins
from models.moco_refactor import MoCo
from models.vicreg_refactor import VICReg
from models.simclr_refactor import SimCLR
from models.simsiam_refactor import SimSiam
from models.nnclr_refactor import NNCLR
from models.concerto import Concerto
# from models.dino import *
from evaluator import infer_embedding, infer_embedding_separate
from anndata.experimental.pytorch import AnnLoader
from data.dataset import OurDataset, OurMultimodalDataset
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
import os

_model_dict = {
    "BYOL": BYOL,
    "BarlowTwins": BarlowTwins,
    "MoCo": MoCo,
    "VICReg": VICReg,
    "SimCLR": SimCLR,
    "SimSiam": SimSiam,
    "NNCLR": NNCLR,
    "Concerto": Concerto,
}

class CheckpointEveryNSteps(pl.Callback):
    """
    Save a checkpoint every N epochs, instead of Lightning's default that checkpoints
    based on validation loss.
    """

    def __init__(
        self,
        save_step_frequency,
        prefix="checkpoint",
        use_modelcheckpoint_filename=False,
    ):
        """
        Args:
            save_step_frequency: how often to save, in epochs
            prefix: add a prefix to the name, only used if
                use_modelcheckpoint_filename=False
            use_modelcheckpoint_filename: just use the ModelCheckpoint callback's
                default filename, don't use ours.
        """
        self.save_step_frequency = save_step_frequency
        self.prefix = prefix
        self.use_modelcheckpoint_filename = use_modelcheckpoint_filename

    def on_train_epoch_end(self, trainer: pl.Trainer, _):
        """Save a checkpoint at the end of every `save_step_frequency`-th epoch."""
        epoch = trainer.current_epoch
        if epoch % self.save_step_frequency == 0:
            if self.use_modelcheckpoint_filename:
                filename = trainer.checkpoint_callback.filename
            else:
                filename = f"{self.prefix}_{epoch=}.ckpt"
            ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filename)
            trainer.save_checkpoint(ckpt_path)
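
# Example (sketch): attaching the callback so a checkpoint is written every 10th epoch
# (including epoch 0). `my_model` and `my_loader` are placeholders, not objects defined
# in this module.
#
#   trainer = pl.Trainer(max_epochs=100,
#                        callbacks=[CheckpointEveryNSteps(save_step_frequency=10, prefix="ssl")])
#   trainer.fit(my_model, my_loader)
#   # writes files such as "ssl_epoch=10.ckpt" under the trainer's checkpoint dirpath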

def train_model(dataset, model_config, random_seed, batch_size,
                num_workers, n_epochs, logger, ckpt_dir, cfg=None):
    model_name = model_config["model"]
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        drop_last=True)
    logger.info(f".. Dataloader ready. Now build {model_name}")

    model = _model_dict[str(model_name)](**model_config)
    print(n_epochs)
    print(model_config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    trainer = pl.Trainer(max_epochs=n_epochs,
                         accelerator=device,
                         default_root_dir=ckpt_dir,
                         callbacks=[CheckpointEveryNSteps(save_step_frequency=25)])  # cpu works for smaller tasks
    logger.info(f".. Model ready. Now train on {device}.")

    try:
        trainer.fit(
            model,
            train_loader,
        )
        logger.info(".. Training done.")
    except Exception as error:
        logger.exception(".. An exception occurred while training: %s", error)
    return model
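
# Example (sketch): a minimal call to train_model. It assumes an AnnData `train_adata`
# has already been loaded; the keys of `model_config` other than "model" depend on the
# per-method Hydra configs and are not spelled out here.
#
#   logger = logging.getLogger(__name__)
#   dataset = OurDataset(adata=train_adata, transforms=None, valid_ids=None)
#   model_config = {"model": "SimCLR"}  # plus the method-specific hyperparameters
#   model = train_model(dataset, model_config, random_seed=0, batch_size=256,
#                       num_workers=8, n_epochs=100, logger=logger, ckpt_dir="./ckpts")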

def inference(model, val_loader):
    outs = []
    for x in val_loader:
        with torch.no_grad():
            outs.append(model.predict(x.layers['counts']))
    embedding = torch.concat(outs)
    embedding = np.array(embedding)
    return embedding
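
# Example (sketch): `inference` expects batches that expose a "counts" layer, which is
# what AnnLoader yields when the AnnData has adata.layers["counts"] set. This wiring is
# an assumption, not something this module enforces.
#
#   val_loader = AnnLoader(val_adata, batch_size=256, shuffle=False)
#   embedding = inference(model, val_loader)  # numpy array, one row per cell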

def train_clf(encoder, train_adata, val_adata, batch_size=256, num_workers=12, ctype_key='CellType'):
    train_loader = torch.utils.data.DataLoader(
        dataset=OurDataset(adata=train_adata, transforms=None, valid_ids=None),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )
    val_loader = torch.utils.data.DataLoader(
        dataset=OurDataset(adata=val_adata, transforms=None, valid_ids=None),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )

    start = time.time()
    train_X, val_X = infer_embedding(encoder, train_loader), infer_embedding(encoder, val_loader)
    train_y = train_adata.obs[ctype_key]
    val_y = val_adata.obs[ctype_key]

    clf = KNeighborsClassifier(n_neighbors=11)
    clf = clf.fit(train_X, train_y)
    run_time = time.time() - start

    y_pred = clf.predict(val_X)
    maavg_f1 = f1_score(val_y, y_pred, average='macro')
    accuracy = accuracy_score(val_y, y_pred)
    return clf, maavg_f1, accuracy, run_time
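
# Example (sketch): evaluating a trained encoder with the kNN probe, assuming the model
# returned by train_model is used as the encoder and both AnnData objects carry a
# "CellType" column in .obs.
#
#   clf, macro_f1, acc, secs = train_clf(model, train_adata, val_adata,
#                                        batch_size=256, num_workers=4)
#   print(f"kNN probe: macro-F1={macro_f1:.3f}, accuracy={acc:.3f} ({secs:.1f}s)")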

def train_clf_multimodal(encoder, train_adata, val_adata, batch_size=256, num_workers=12, ctype_key='CellType'):
    train_loader = torch.utils.data.DataLoader(
        dataset=OurMultimodalDataset(adata=train_adata, transforms=None, valid_ids=None),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )
    val_loader = torch.utils.data.DataLoader(
        dataset=OurMultimodalDataset(adata=val_adata, transforms=None, valid_ids=None),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )

    start = time.time()
    train_X, val_X = infer_embedding(encoder, train_loader), infer_embedding(encoder, val_loader)
    train_y = train_adata.obs[ctype_key]
    val_y = val_adata.obs[ctype_key]

    clf = KNeighborsClassifier(n_neighbors=11)
    clf = clf.fit(train_X, train_y)
    run_time = time.time() - start

    y_pred = clf.predict(val_X)
    maavg_f1 = f1_score(val_y, y_pred, average='macro')
    accuracy = accuracy_score(val_y, y_pred)
    return clf, maavg_f1, accuracy, run_time

def predict_protein_multimodal(encoder, train_adata, val_adata, batch_size=256, num_workers=12, ctype_key='CellType'):
    train_dataset = OurMultimodalDataset(adata=train_adata, transforms=None, valid_ids=None)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )
    val_dataset = OurMultimodalDataset(adata=val_adata, transforms=None, valid_ids=None)
    val_loader = torch.utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )

    start = time.time()
    train_X, train_rna, _ = infer_embedding_separate(encoder, train_loader)
    val_X, val_rna, _ = infer_embedding_separate(encoder, val_loader)

    # Query-to-reference
    train_y = train_adata.obs[ctype_key]
    val_y = val_adata.obs[ctype_key]
    clf = KNeighborsClassifier(n_neighbors=11)
    clf = clf.fit(train_X, train_y)
    run_time = time.time() - start
    y_pred = clf.predict(val_X)
    maavg_f1 = f1_score(val_y, y_pred, average='macro')
    accuracy = accuracy_score(val_y, y_pred)

    # TODO: Query-to-reference with RNA only
    start3 = time.time()
    clf_rna = KNeighborsClassifier(n_neighbors=11)
    clf_rna = clf_rna.fit(train_rna, train_y)
    run_time3 = time.time() - start3
    y_pred_rna = clf_rna.predict(val_rna)
    maavg_f1_rna = f1_score(val_y, y_pred_rna, average='macro')
    accuracy_rna = accuracy_score(val_y, y_pred_rna)

    # Predict protein and measure Pearson correlation
    start2 = time.time()
    nbrs = NearestNeighbors(metric='cosine', n_neighbors=5, algorithm='auto').fit(train_rna)
    indices = nbrs.kneighbors(val_rna, return_distance=False)
    val_new_protein = np.array(train_dataset.adata2.X.todense())[indices].mean(axis=1)
    tmp = val_dataset.adata2.X.todense()
    pearsons = []
    for true_protein, pred_protein in zip(tmp, val_new_protein):
        pearsons.append(np.corrcoef(pred_protein, true_protein)[0, 1])

    # Query-to-reference with the predicted protein
    val_new_loader = torch.utils.data.DataLoader(
        dataset=OurMultimodalDataset(adata=val_adata, transforms=None, valid_ids=None, new_protein=np.array(val_new_protein)),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )
    val_new_X = infer_embedding(encoder, val_new_loader)
    run_time2 = time.time() - start2
    y_pred2 = clf.predict(val_new_X)
    maavg_f1_2 = f1_score(val_y, y_pred2, average='macro')
    accuracy2 = accuracy_score(val_y, y_pred2)

    return (
        (clf, maavg_f1, accuracy, run_time),
        (maavg_f1_2, accuracy2, np.mean(pearsons), np.min(pearsons), np.max(pearsons), run_time2),
        (clf_rna, maavg_f1_rna, accuracy_rna, run_time3),
    )
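
# Example (sketch): unpacking the three result tuples returned above.
#
#   joint, protein_pred, rna_only = predict_protein_multimodal(model, train_adata, val_adata)
#   clf, f1_joint, acc_joint, t_joint = joint
#   f1_pred, acc_pred, pearson_mean, pearson_min, pearson_max, t_pred = protein_pred
#   clf_rna, f1_rna, acc_rna, t_rna = rna_only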