Commit d5aba73e authored by Raymond Yuan

minor changes

parent 439a7edc
@@ -2,7 +2,6 @@ import os
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 import threading
 import gym
 import multiprocessing
@@ -17,7 +16,6 @@ from tensorflow.python import keras
 from tensorflow.python.keras import layers
 
 tf.enable_eager_execution()
-print("Eager execution: {}".format(tf.executing_eagerly()))
 
 def str2bool(v):
   if v.lower() in ('yes', 'true', 't', 'y', '1'):
@@ -32,9 +30,8 @@ parser = argparse.ArgumentParser(description='Run A3C algorithm on the game '
                                              'Cartpole.')
 parser.add_argument('--algorithm', default='a3c', type=str,
                     help='Choose between \'a3c\' and \'random\'.')
-parser.add_argument("--train", type=str2bool, default=True,
-                    help='Train our model or to run an existing model and '
-                         'watch it play.')
+parser.add_argument('--train', dest='train', action='store_true',
+                    help='Train our model.')
 parser.add_argument('--lr', default=0.0005,
                     help='Learning rate for the shared optimizer.')
 parser.add_argument('--update-freq', default=20, type=int,
@@ -45,10 +42,8 @@ parser.add_argument('--gamma', default=0.99,
                     help='Discount factor of rewards.')
 parser.add_argument('--save-dir', default='/tmp/', type=str,
                     help='Directory in which you desire to save the model.')
 args = parser.parse_args()
 
 class ActorCriticModel(keras.Model):
   def __init__(self, state_size, action_size):
     super(ActorCriticModel, self).__init__()
@@ -67,22 +62,6 @@ class ActorCriticModel(keras.Model):
     values = self.values(v1)
     return logits, values
 
-def tf_wrap(np_array, dtype=np.float32):
-  """Converts an np array to a tf constant.
-
-  Arguments:
-    np_array: Input array.
-    dtype: The desired data type of the array.
-
-  Returns:
-    A tensor of the np array of type dtype.
-  """
-  if np_array.dtype != dtype:
-    np_array = np_array.astype(dtype)
-  return tf.constant(np_array)
-
 def record(episode,
            episode_reward,
            worker_idx,
@@ -170,7 +149,7 @@ class MasterAgent():
     print(self.state_size, self.action_size)
     self.global_model = ActorCriticModel(self.state_size, self.action_size)  # global network
-    self.global_model(tf.constant(np.random.random((1, self.state_size)), dtype=tf.float32))
+    self.global_model(tf.convert_to_tensor(np.random.random((1, self.state_size)), dtype=tf.float32))
 
   def train(self):
     if args.algorithm == 'random':
@@ -180,33 +159,31 @@ class MasterAgent():
     res_queue = Queue()
 
-    # We run the algorithm on cpu only!
-    with tf.device('/cpu:0'):
-      workers = [Worker(self.state_size,
-                        self.action_size,
-                        self.global_model,
-                        self.opt, res_queue,
-                        i, save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]
-
-      for i, worker in enumerate(workers):
-        print("Starting worker {}".format(i))
-        worker.start()
-
-      moving_average_rewards = []  # record episode reward to plot
-      while True:
-        reward = res_queue.get()
-        if reward is not None:
-          moving_average_rewards.append(reward)
-        else:
-          break
-      [w.join() for w in workers]
-
-      plt.plot(moving_average_rewards)
-      plt.ylabel('Moving average ep reward')
-      plt.xlabel('Step')
-      plt.savefig(os.path.join(self.save_dir,
-                               '{} Moving Average.png'.format(self.game_name)))
-      plt.show()
+    workers = [Worker(self.state_size,
+                      self.action_size,
+                      self.global_model,
+                      self.opt, res_queue,
+                      i, save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]
+
+    for i, worker in enumerate(workers):
+      print("Starting worker {}".format(i))
+      worker.start()
+
+    moving_average_rewards = []  # record episode reward to plot
+    while True:
+      reward = res_queue.get()
+      if reward is not None:
+        moving_average_rewards.append(reward)
+      else:
+        break
+    [w.join() for w in workers]
+
+    plt.plot(moving_average_rewards)
+    plt.ylabel('Moving average ep reward')
+    plt.xlabel('Step')
+    plt.savefig(os.path.join(self.save_dir,
+                             '{} Moving Average.png'.format(self.game_name)))
+    plt.show()
 
   def play(self):
     env = gym.make(self.game_name).unwrapped
@@ -222,7 +199,7 @@ class MasterAgent():
     try:
       while not done:
         env.render(mode='rgb_array')
-        policy, value = model(tf_wrap(state[None, :]))
+        policy, value = model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
         policy = tf.nn.softmax(policy)
         action = np.argmax(policy)
         state, reward, done, _ = env.step(action)
@@ -295,7 +272,9 @@ class Worker(threading.Thread):
       time_count = 0
       done = False
       while not done:
-        logits, _ = self.local_model(tf_wrap(current_state[None, :]))
+        logits, _ = self.local_model(
+            tf.convert_to_tensor(current_state[None, :],
+                                 dtype=tf.float32))
         probs = tf.nn.softmax(logits)
         action = np.random.choice(self.action_size, p=probs.numpy()[0])
@@ -356,7 +335,9 @@ class Worker(threading.Thread):
     if done:
       reward_sum = 0.  # terminal
     else:
-      reward_sum = self.local_model(tf_wrap(new_state[None, :]))[-1].numpy()[0]
+      reward_sum = self.local_model(
+          tf.convert_to_tensor(new_state[None, :],
+                               dtype=tf.float32))[-1].numpy()[0]
 
     # Get discounted rewards
     discounted_rewards = []
@@ -365,9 +346,12 @@ class Worker(threading.Thread):
       discounted_rewards.append(reward_sum)
     discounted_rewards.reverse()
 
-    logits, values = self.local_model(tf_wrap(np.vstack(memory.states)))
+    logits, values = self.local_model(
+        tf.convert_to_tensor(np.vstack(memory.states),
+                             dtype=tf.float32))
     # Get our advantages
-    advantage = tf_wrap(np.array(discounted_rewards)[:, None]) - values
+    advantage = tf.convert_to_tensor(np.array(discounted_rewards)[:, None],
+                                     dtype=tf.float32) - values
     # Value loss
     value_loss = advantage ** 2
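
For reference, a minimal standalone sketch of the tf.convert_to_tensor pattern used on the + side of this diff (TF 1.x with eager execution enabled, as in this file). The shape (1, 4) and the variable names below are illustrative only, not taken from the commit:

import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

# A toy observation batch; np.random.random returns float64 by default.
state = np.random.random((1, 4))

# tf.convert_to_tensor wraps the array and casts it to the requested dtype,
# covering what the removed tf_wrap helper did with astype + tf.constant.
state_t = tf.convert_to_tensor(state, dtype=tf.float32)
print(state_t.dtype, state_t.shape)  # float32 (1, 4)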