distillation.py
# -*- coding: utf-8 -*-
import sys
import os
import tensorflow as tf
from Agent.MaskedSoftmax import MaskedSoftmax

if 'COLAB_GPU' in os.environ:
  # fix resolve modules
  from os.path import dirname
  sys.path.append(dirname(dirname(dirname(__file__))))
else: # local GPU
  gpus = tf.config.experimental.list_physical_devices('GPU')
  tf.config.experimental.set_virtual_device_configuration(
    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3 * 1024)]
  )

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
import tensorflow.keras as keras

from model import createModel
from Core.MazeRLWrapper import MazeRLWrapper
from Utils.ExperienceBuffers.CebPrioritized import CebPrioritized
from Agent.DQNAgent import DQNAgent
from Agent.DQNEnsembleAgent import DQNEnsembleAgent
import time
import Utils
from Utils.ExperienceBuffers.CebLinear import CebLinear
import glob
import numpy as np
#######################################
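# One distillation training pass: samples (N+1)-step sequences from the replay
# buffer, builds bootstrapped N-step Q-targets with a frozen clone of the student,
# and fits the wrapped student on states, teacher distributions, action masks
# and value targets. Returns the average loss over the sampled batches.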
def train(model, trainableModel, memory, params):
  modelClone = tf.keras.models.clone_model(model)
  modelClone.set_weights(model.get_weights()) # use clone model for stability

  BOOTSTRAPPED_STEPS = params['steps']
  GAMMA = params['gamma']
  ALPHA = params.get('alpha', 1.0)
  rows = np.arange(params['batchSize'])
  lossSum = 0
  for _ in range(params['episodes']):
    allStates, actions, rewards, actionsMask, teacherPredictions, nextStateScoreMultiplier = memory.sampleSequenceBatch(
      batch_size=params['batchSize'],
      maxSamplesFromEpisode=params.get('maxSamplesFromEpisode', 16),
      sequenceLen=BOOTSTRAPPED_STEPS + 1
    )
    states = allStates[:, :-1]
    rewards = rewards[:, :-1]
    actions = actions[:, 0]

    # N-step bootstrapped target: discounted sum of the next N rewards plus the
    # discounted best Q-value of the state reached after BOOTSTRAPPED_STEPS steps
    futureScores = modelClone.predict(allStates[:, -1]).max(axis=-1) * nextStateScoreMultiplier[:, -1]
    totalRewards = (rewards * (GAMMA ** np.arange(BOOTSTRAPPED_STEPS))).sum(axis=-1)
    targets = modelClone.predict(states[:, 0])
    targets[rows, actions] += ALPHA * (
      totalRewards + futureScores * (GAMMA ** BOOTSTRAPPED_STEPS) - targets[rows, actions]
    )

    lossSum += trainableModel.fit(
      [states[:, 0], teacherPredictions[:, 0], actionsMask[:, 0], targets],
      epochs=1, verbose=0
    ).history['loss'][0]
  ###
  return lossSum / params['episodes']
def complexLoss(valueLoss, teacherPower, distributions, actionsMasks, y_true, y_pred, y_pred_softmax):
  # mask out invalid actions, then combine the value loss on Q-values with a
  # KL-divergence term towards the teacher's action distribution, scaled by teacherPower
  lossValues = valueLoss(y_true * actionsMasks, y_pred * actionsMasks)
  lossDistribution = keras.losses.kl_divergence(distributions * actionsMasks, y_pred_softmax * actionsMasks)
  return lossValues + (lossDistribution * teacherPower)
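# Wraps the student network into a trainable model that also takes the teacher's
# action distributions, the action masks and the Q-value targets as extra inputs,
# attaching complexLoss via add_loss. Returns the wrapped model together with the
# teacherPower variable so the distillation weight can be adjusted per epoch.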
def wrapStudentModel(student):
  inputA = keras.layers.Input(shape=student.layers[0].input_shape[0][1:])
  inputDistributions = keras.layers.Input(shape=(4, ))
  inputMasks = keras.layers.Input(shape=(4, ))
  inputTargets = keras.layers.Input(shape=(4, ))
  teacherPower = tf.Variable(1.0, dtype=tf.float32)

  res = student(inputA)
  resSoftmax = MaskedSoftmax()(res, inputMasks)

  model = keras.Model(inputs=[inputA, inputDistributions, inputMasks, inputTargets], outputs=[res, resSoftmax])
  model.add_loss(complexLoss(
    Huber(delta=1),
    teacherPower,
    inputDistributions, inputMasks, inputTargets,
    res, resSoftmax
  ))
  model.compile(optimizer=Adam(learning_rate=1e-3), loss=None)
  return model, teacherPower
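# Main distillation loop: collects replays from the maze environments (with the
# teacher's predictions attached), trains the student on them, evaluates the
# resulting scores and keeps the best weights.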
def learn_environment(teacher, model, params):
  NAME = params['name']
  BATCH_SIZE = params['batch size']
  GAMMA = params['gamma']
  BOOTSTRAPPED_STEPS = params['bootstrapped steps']
  LOOP_LIMIT = params['maze']['loop limit']
  metrics = {}

  environments = [
    MazeRLWrapper(params['maze']) for _ in range(params['test episodes'])
  ]

  memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
  trainableModel, teacherPower = wrapStudentModel(model)
  ######################################################
  def withTeacherPredictions(replay):
    prevStates, actions, rewards, actionsMasks = zip(*replay)
    teacherPredictions = teacher.predict(np.array(prevStates), np.array(actionsMasks))
    return list(zip(prevStates, actions, rewards, actionsMasks, teacherPredictions))
  def testModel(EXPLORE_RATE):
    for e in environments: e.reset()
    replays = [replay for replay, _ in Utils.emulateBatch(
        environments,
        DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
        maxSteps=params.get('max test steps')
      )
    ]

    ################
    # explore if hit the loop
    envsIndexes = [i for i, e in enumerate(environments) if e.hitTheLoop]
    if envsIndexes:
      envs = [environments[i] for i in envsIndexes]
      for e in envs: e.Continue()
      exploreReplays = Utils.emulateBatch(
        envs,
        DQNAgent(
          model,
          exploreRate=params.get('explore rate after loop', 1),
          noise=params.get('agent noise after loop', 0)
        ),
        maxSteps=params.get('max steps after loop', 16)
      )
      for ind, (replay, _) in zip(envsIndexes, exploreReplays):
        replays[ind] += replay[1:]
    ################

    for replay in replays:
      if BOOTSTRAPPED_STEPS < len(replay):
        memory.addEpisode(withTeacherPredictions(replay), terminated=True)
    return [x.score for x in environments]
  ######################################################
  # collect some experience
  for _ in range(2):
    testModel(EXPLORE_RATE=0)
  #######################
  bestModelScore = -float('inf')
  for epoch in range(params['epochs']):
    T = time.time()
    EXPLORE_RATE = params['explore rate'](epoch)
    alpha = params.get('alpha', lambda _: 1)(epoch)
    teacherP = max((0, params.get('teacher power', lambda _: 1)(epoch) ))
    teacherPower.assign(teacherP)
    print(
      '[%s] %d/%d epoch. Explore rate: %.3f. Alpha: %.5f. Teacher power: %.3f' % (
        NAME, epoch, params['epochs'], EXPLORE_RATE, alpha, teacherP
      )
    )
    ##################
    # Training
    trainLoss = train(
      model, trainableModel, memory,
      {
        'gamma': GAMMA,
        'batchSize': BATCH_SIZE,
        'steps': BOOTSTRAPPED_STEPS,
        'episodes': params['train episodes'](epoch),
        'alpha': alpha
      }
    )
    print('Avg. train loss: %.4f' % trainLoss)
    ##################
    # test
    print('Testing...')
    scores = testModel(EXPLORE_RATE)
    Utils.trackScores(scores, metrics)
    ##################
    scoreSum = sum(scores)
    print('Scores sum: %.5f' % scoreSum)
    if (bestModelScore < scoreSum) and (params['warm up epochs'] < epoch):
      print('save best model (%.2f => %.2f)' % (bestModelScore, scoreSum))
      bestModelScore = scoreSum
      model.save_weights('weights/%s.h5' % NAME)
    ##################
    os.makedirs('charts', exist_ok=True)
    Utils.plotData2file(metrics, 'charts/%s.jpg' % NAME)
    print('Epoch %d finished in %.1f sec.' % (epoch, time.time() - T))
    print('------------------')
#######################################
MAZE_FOV = 3
MAZE_MINIMAP_SIZE = 8
MAZE_LOOPLIMIT = 32
#######################################
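# Entry point: build an ensemble teacher from all previously trained agents found
# in weights/agent-*.h5 and distill it into a single freshly created student model.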
if __name__ == "__main__":
  DEFAULT_MAZE_PARAMS = {
    'size': 40,
    'FOV': MAZE_FOV,
    'minimapSize': MAZE_MINIMAP_SIZE,
    'loop limit': MAZE_LOOPLIMIT,
  }

  MODEL_INPUT_SHAPE = MazeRLWrapper(DEFAULT_MAZE_PARAMS).input_size

  models = []
  for x in glob.iglob('weights/agent-*.h5'):
    filename = os.path.abspath(x)
    model = createModel(shape=MODEL_INPUT_SHAPE)
    model.load_weights(filename)
    models.append(model)

  teacher = DQNEnsembleAgent(models)
  #######################
  DEFAULT_LEARNING_PARAMS = {
    'maze': DEFAULT_MAZE_PARAMS,
    'batch size': 256,
    'gamma': 0.95,
    'bootstrapped steps': 3,

    'epochs': 100,
    'warm up epochs': 0,
    'test episodes': 128,
    'train episodes': lambda _: 128,

    'alpha': lambda _: 1,
    'explore rate': lambda _: 0,
    'agent noise': 0.01,

    'explore rate after loop': 0.2,
    'agent noise after loop': 0.1,

    'max test steps': 1000
  }
  #######################
  # just transfer distributions from teacher
  learn_environment(
    teacher,
    createModel(shape=MODEL_INPUT_SHAPE),
    {
      **DEFAULT_LEARNING_PARAMS,
      'name': 'distilled',
      'teacher power': lambda epoch: 1,
    }
  )