"""Defines deep Q-network architectures.
Heavily influenced by DeepMind's seminal paper 'Playing Atari with Deep Reinforcement Learning'
(Mnih et al., 2013) and 'Human-level control through deep reinforcement learning' (Mnih et al.,
2015).
"""
import abc
import math
import numpy as np
import tensorflow as tf


def _convolutional_layer(x, shape, stride, activation_fn):
    """Creates a convolutional layer with Glorot (Xavier) uniform weight initialization."""
    if len(shape) != 4:
        raise ValueError('Shape "{}" is invalid. Must have length 4.'.format(shape))

    num_input_params = shape[0] * shape[1] * shape[2]
    num_output_params = shape[0] * shape[1] * shape[3]
    maxval = math.sqrt(6 / (num_input_params + num_output_params))
    W = tf.Variable(tf.random_uniform(shape, -maxval, maxval), name='Weights')
    b = tf.Variable(tf.constant(0.1, tf.float32, [shape[3]]), name='Bias')
    conv = tf.nn.conv2d(x, W, [1, stride, stride, 1], 'VALID')

    return activation_fn(tf.nn.bias_add(conv, b))
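
# With 'VALID' padding, each convolution shrinks the spatial size to
# floor((input - filter) / stride) + 1. For example (illustrative only), a 4x4
# filter with stride 2 maps an 84x84 input to 41x41 feature maps.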


def _fully_connected_layer(x, shape, bias_shape, activation_fn):
    """Creates a fully connected layer with uniformly initialized weights."""
    if len(shape) != 2:
        raise ValueError('Shape "{}" is invalid. Must have length 2.'.format(shape))

    maxval = 1 / math.sqrt(shape[0] + shape[1])
    W = tf.Variable(tf.random_uniform(shape, -maxval, maxval), name='Weights')
    b = tf.Variable(tf.constant(0.1, tf.float32, bias_shape), name='Bias')

    return activation_fn(tf.matmul(x, W) + b)
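
# Note that this draws weights from a narrower uniform range than the Glorot bound
# used for the convolutional layers: 1/sqrt(fan_in + fan_out) rather than
# sqrt(6/(fan_in + fan_out)).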


class _QNetwork(abc.ABC):
    """Base class for neural networks that learn the Q (action value) function."""

    @abc.abstractmethod
    def __init__(self):
        """Creates a Q-network."""

    def get_action_value(self, state, action):
        """Estimates the value of the specified action for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
            action: A valid action. Can be batched into multiple actions.
        """
        sess = tf.get_default_session()
        return sess.run(self.estimated_action_value, {self.x: state, self.action: action})

    def get_optimal_action_value(self, state):
        """Estimates the optimal action value for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
        """
        sess = tf.get_default_session()
        return sess.run(self.optimal_action_value, {self.x: state})

    def get_optimal_action(self, state):
        """Estimates the optimal action for the specified state.

        Args:
            state: State of the environment. Can be batched into multiple states.
        """
        sess = tf.get_default_session()
        return sess.run(self.optimal_action, {self.x: state})
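
# Usage note: the get_* helpers above fetch tf.get_default_session(), so they must
# be called inside a session context such as `with tf.Session() as sess:` (see the
# illustrative sketch at the bottom of this file).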


class DeepQNetwork(_QNetwork):
    """A deep Q-network."""

    def __init__(self, state_shape, num_actions):
        """Creates a deep Q-network.

        Args:
            state_shape: A vector with three values, representing the width, height and depth of
                input states. For example, the shape of 100x80 RGB images is [100, 80, 3].
            num_actions: Number of possible actions.
        """
        width, height, depth = state_shape
        self.x = tf.placeholder(tf.float32, [None, width, height, depth], name='Input_States')

        with tf.name_scope('Convolutional_Layer_1'):
            h_conv1 = _convolutional_layer(self.x, [4, 4, depth, 64], 2, tf.nn.relu)

        with tf.name_scope('Convolutional_Layer_2'):
            h_conv2 = _convolutional_layer(h_conv1, [3, 3, 64, 64], 2, tf.nn.relu)

        with tf.name_scope('Convolutional_Layer_3'):
            h_conv3 = _convolutional_layer(h_conv2, [3, 3, 64, 64], 1, tf.nn.relu)

        # Flatten the output to feed it into fully connected layers.
        num_params = np.prod(h_conv3.get_shape().as_list()[1:])
        h_flat = tf.reshape(h_conv3, [-1, num_params])

        with tf.name_scope('Fully_Connected_Layer_1'):
            h_fc = _fully_connected_layer(h_flat, [num_params, 512], [512], tf.nn.relu)

        with tf.name_scope('Fully_Connected_Layer_2'):
            # Use a single bias shared across all actions.
            self.Q = _fully_connected_layer(h_fc, [512, num_actions], [1], tf.identity)

        # Estimate the optimal action and its expected value.
        self.optimal_action = tf.squeeze(tf.argmax(self.Q, 1, name='Optimal_Action'))
        self.optimal_action_value = tf.squeeze(tf.reduce_max(self.Q, 1))

        # Estimate the value of the specified action.
        self.action = tf.placeholder(tf.uint8, name='Action')
        one_hot_action = tf.one_hot(self.action, num_actions)
        self.estimated_action_value = tf.reduce_sum(self.Q * one_hot_action, 1)
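
# Shape check (illustrative, assuming 84x84 stacked-frame inputs as in Mnih et al.,
# 2015): 84x84 -> 41x41 (4x4 filters, stride 2) -> 20x20 (3x3, stride 2) -> 18x18
# (3x3, stride 1), giving 18 * 18 * 64 = 20736 flattened features into the 512-unit
# layer. Other state shapes work too; num_params is computed from the graph.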


class DuelingDeepQNetwork(_QNetwork):
    """A deep Q-network with a dueling architecture."""

    def __init__(self, state_shape, num_actions):
        """Creates a deep Q-network with a dueling architecture.

        Args:
            state_shape: A vector with three values, representing the width, height and depth of
                input states. For example, the shape of 100x80 RGB images is [100, 80, 3].
            num_actions: Number of possible actions.
        """
        width, height, depth = state_shape
        self.x = tf.placeholder(tf.float32, [None, width, height, depth], name='Input_States')

        with tf.name_scope('Convolutional_Layer_1'):
            h_conv1 = _convolutional_layer(self.x, [4, 4, depth, 64], 2, tf.nn.relu)

        with tf.name_scope('Convolutional_Layer_2'):
            h_conv2 = _convolutional_layer(h_conv1, [3, 3, 64, 64], 2, tf.nn.relu)

        with tf.name_scope('Convolutional_Layer_3'):
            h_conv3 = _convolutional_layer(h_conv2, [3, 3, 64, 64], 1, tf.nn.relu)

        # Flatten the output to feed it into fully connected layers.
        num_params = np.prod(h_conv3.get_shape().as_list()[1:])
        h_flat = tf.reshape(h_conv3, [-1, num_params])

        with tf.name_scope('Advantage_Stream'):
            h_advantage_fc = _fully_connected_layer(h_flat, [num_params, 512], [512], tf.nn.relu)
            # Use a single bias shared across all actions.
            advantage = _fully_connected_layer(h_advantage_fc, [512, num_actions], [1], tf.identity)

        with tf.name_scope('State_Value_Stream'):
            h_state_value_fc = _fully_connected_layer(h_flat, [num_params, 512], [512], tf.nn.relu)
            state_value = _fully_connected_layer(h_state_value_fc, [512, 1], [1], tf.identity)

        # Combine the streams into action values: Q = V + A - mean(A). Subtracting the mean
        # advantage, as suggested by Wang et al., 2015, keeps the two streams identifiable
        # (adding a constant to A and subtracting it from V would otherwise leave Q
        # unchanged) and improves training stability.
        self.Q = state_value + advantage - tf.reduce_mean(advantage, 1, keep_dims=True)

        # Estimate the optimal action and its expected value.
        self.optimal_action = tf.squeeze(tf.argmax(self.Q, 1, name='Optimal_Action'))
        self.optimal_action_value = tf.squeeze(tf.reduce_max(self.Q, 1))

        # Estimate the value of the specified action.
        self.action = tf.placeholder(tf.uint8, name='Action')
        one_hot_action = tf.one_hot(self.action, num_actions)
        self.estimated_action_value = tf.reduce_sum(self.Q * one_hot_action, 1)
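

if __name__ == '__main__':
    # A minimal usage sketch. The 84x84x4 state shape and the 6-action space are
    # assumed placeholder values, loosely following the Atari setup in Mnih et al.,
    # 2015; substitute your environment's dimensions. DuelingDeepQNetwork is a
    # drop-in replacement for DeepQNetwork here.
    network = DeepQNetwork(state_shape=[84, 84, 4], num_actions=6)
    states = np.zeros([2, 84, 84, 4], dtype=np.float32)  # A dummy batch of two states.

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Inside this block the session is the default session, which the
        # _QNetwork helpers rely on via tf.get_default_session().
        print('Optimal actions:', network.get_optimal_action(states))
        print('Optimal action values:', network.get_optimal_action_value(states))
        print('Value of action 0:', network.get_action_value(states, [0, 0]))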