# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import print_function

from builtins import range

import sys

import numpy as np
import tensorflow as tf
# import moviepy.editor as mpy

import envwrap
import util
import valuerl
import worldmodel
from config import config

MODEL_NAME = config["name"]
LOG_PATH = util.create_directory(
    "output/" + config["env"] + "/" + MODEL_NAME + "/" + config["log_path"]
) + "/" + MODEL_NAME
LOAD_PATH = util.create_directory(
    "output/" + config["env"] + "/" + MODEL_NAME + "/" + config["save_model_path"]
)

OBS_DIM = np.prod(config["obs_dims"])
HIDDEN_DIM = config["hidden_dim"]
ACTION_DIM = config["action_dim"]
MAX_FRAMES = config["max_frames"]
REWARD_SCALE = config["reward_scale"]
DISCOUNT = config["discount"]

ALGO = config["policy_config"]["algo"]
AGENT_BATCH_SIZE = config["agent_config"]["batch_size"]
EVALUATOR_BATCH_SIZE = config["evaluator_config"]["batch_size"]
RELOAD_EVERY_N = config["agent_config"]["reload_every_n"]
FRAMES_BEFORE_LEARNING = config["policy_config"]["frames_before_learning"]
FRAMES_PER_UPDATE = config["policy_config"]["frames_per_update"]
LEARNER_EPOCH_N = config["policy_config"]["epoch_n"]
SYNC_UPDATES = config["policy_config"]["frames_per_update"] >= 0
POLICY_BAYESIAN_CONFIG = config["policy_config"]["bayesian"]
AUX_CONFIG = config["aux_config"]
DDPG_EXPLORE_CHANCE = (
    config["policy_config"]["explore_chance"] if ALGO == "ddpg" else 0.
)
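# For reference, a sketch of the config structure this script reads. The key
# names below are taken from the lookups above; the example values are
# illustrative assumptions, not defaults from config.py:
#
#   {
#     "name": ..., "env": ..., "log_path": ..., "save_model_path": ...,
#     "obs_dims": [...], "hidden_dim": ..., "action_dim": ...,
#     "max_frames": ..., "reward_scale": ..., "discount": ...,
#     "policy_config": {"algo": "ddpg", "bayesian": {...}, "epoch_n": ...,
#                       "frames_before_learning": ..., "frames_per_update": ...,
#                       "explore_chance": ...},
#     "agent_config": {"batch_size": ..., "reload_every_n": ...},
#     "evaluator_config": {"batch_size": ...},
#     "aux_config": {...},
#     "model_config": False,  # or {"bayesian": {...}} for model-augmented runs
#   }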
MODEL_AUGMENTED = config["model_config"] is not False
if MODEL_AUGMENTED:
    MODEL_BAYESIAN_CONFIG = config["model_config"]["bayesian"]

FILENAME = sys.argv[3]

if __name__ == '__main__':
    oprl = valuerl.ValueRL(MODEL_NAME, ALGO, OBS_DIM, ACTION_DIM, HIDDEN_DIM,
                           REWARD_SCALE, DISCOUNT, POLICY_BAYESIAN_CONFIG,
                           AUX_CONFIG, DDPG_EXPLORE_CHANCE)
    obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
    # (Method name is spelled "evalution" in valuerl.py.)
    policy_actions, _ = oprl.build_evalution_graph(obs_loader, mode="exploit")

    if MODEL_AUGMENTED:
        next_obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
        reward_loader = tf.placeholder(tf.float32, [1])
        done_loader = tf.placeholder(tf.float32, [1])
        # Named world_model so the instance does not shadow the worldmodel module.
        world_model = worldmodel.DeterministicWorldModel(
            MODEL_NAME, OBS_DIM, ACTION_DIM, HIDDEN_DIM, REWARD_SCALE,
            DISCOUNT, MODEL_BAYESIAN_CONFIG)
        _, _, _, _, _, confidence, _ = oprl.build_Q_expansion_graph(
            next_obs_loader, reward_loader, done_loader, world_model,
            rollout_len=3, model_ensembling=True)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    oprl.load(sess, FILENAME)
    if MODEL_AUGMENTED:
        world_model.load(sess, FILENAME)

    env = envwrap.get_env(config["env"])

    # hist[h, b] counts how often the confidence at expansion step h fell into
    # decile bucket b.
    hist = np.zeros([4, 10])
    for _ in range(10):
        ts = 0
        rgb_frames = []
        obs, reward, done, reset = env.reset(), 0, False, False
        while not reset:
            # env.internal_env.render()
            # rgb_frames.append(env.internal_env.render(mode='rgb_array'))
            # action = env.action_space.sample()
            all_actions = sess.run(policy_actions,
                                   feed_dict={obs_loader: np.array([obs])})
            all_actions = np.clip(all_actions, -1., 1.)
            action = all_actions[0]
            obs, _reward, done, reset = env.step(action)
            if MODEL_AUGMENTED:
                _confidences = sess.run(confidence, feed_dict={
                    next_obs_loader: np.expand_dims(obs, 0),
                    reward_loader: np.expand_dims(_reward, 0),
                    done_loader: np.expand_dims(done, 0)})
                # print("%.02f %.02f %.02f %.02f" % tuple(_confidences[0, 0]))
                for h in range(4):
                    # Subtracting 1e-5 keeps a confidence of exactly 1.0 in
                    # bucket 9 instead of overflowing to bucket 10.
                    bucket = int((_confidences[0, 0, h] - 1e-5) * 10)
                    hist[h, bucket] += 1
            reward += _reward
            ts += 1
            # print(ts, _reward, reward)
        print(ts, reward)

    if MODEL_AUGMENTED:
        # Normalize each horizon's counts into a distribution before printing.
        # Without model augmentation hist stays all zeros, so this block is
        # skipped to avoid dividing by zero.
        hist /= np.sum(hist, axis=1, keepdims=True)
        for row in reversed(hist.T):
            print(' '.join(["%.02f"] * 4) % tuple(row))

    # clip = mpy.ImageSequenceClip(rgb_frames, fps=100)
    # clip.write_videofile(FILENAME + "/movie.mp4")
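# A minimal sketch of how this script might be launched (hypothetical
# invocation; the actual CLI handling lives in config.py, and only
# sys.argv[3] -- the checkpoint prefix passed to .load() above -- is read
# directly here):
#
#   python this_script.py <argv1> <argv2> <checkpoint_prefix>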