Commit 65da497f authored by Shining Sun's avatar Shining Sun

Merge branch 'master' of https://github.com/tensorflow/models into cifar_keras

parents 93e0022d 7d032ea3
@@ -22,7 +22,7 @@ import math
import numpy as np
import gym
import maze_env_utils
from environments import maze_env_utils
# Directory that contains mujoco xml files.
MODEL_DIR = 'environments/assets'
@@ -39,6 +39,13 @@ class MazeEnv(gym.Env):
maze_id=None,
maze_height=0.5,
maze_size_scaling=8,
n_bins=0,
sensor_range=3.,
sensor_span=2 * math.pi,
observe_blocks=False,
put_spin_near_agent=False,
top_down_view=False,
manual_collision=False,
*args,
**kwargs):
self._maze_id = maze_id
@@ -52,6 +59,14 @@ class MazeEnv(gym.Env):
self.MAZE_HEIGHT = height = maze_height
self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
self._n_bins = n_bins
self._sensor_range = sensor_range * size_scaling
self._sensor_span = sensor_span
self._observe_blocks = observe_blocks
self._put_spin_near_agent = put_spin_near_agent
self._top_down_view = top_down_view
self._manual_collision = manual_collision
self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze(maze_id=self._maze_id)
self.elevated = any(-1 in row for row in structure) # Elevate the maze to allow for falling.
self.blocks = any(
@@ -61,6 +76,13 @@ class MazeEnv(gym.Env):
torso_x, torso_y = self._find_robot()
self._init_torso_x = torso_x
self._init_torso_y = torso_y
self._init_positions = [
(x - torso_x, y - torso_y)
for x, y in self._find_all_robots()]
self._xy_to_rowcol = lambda x, y: (2 + (y + size_scaling / 2) / size_scaling,
2 + (x + size_scaling / 2) / size_scaling)
self._view = np.zeros([5, 5, 3]) # walls (immovable), chasms (fall), movable blocks
height_offset = 0.
if self.elevated:
@@ -74,9 +96,13 @@ class MazeEnv(gym.Env):
default = tree.find(".//default")
default.find('.//geom').set('solimp', '.995 .995 .01')
self.movable_blocks = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if self.elevated and structure[i][j] not in [-1]:
struct = structure[i][j]
if struct == 'r' and self._put_spin_near_agent:
struct = maze_env_utils.Move.SpinXY
if self.elevated and struct not in [-1]:
# Create elevated platform.
ET.SubElement(
worldbody, "geom",
@@ -93,7 +119,7 @@ class MazeEnv(gym.Env):
conaffinity="1",
rgba="0.9 0.9 0.9 1",
)
if structure[i][j] == 1: # Unmovable block.
if struct == 1: # Unmovable block.
# Offset all coordinates so that robot starts at the origin.
ET.SubElement(
worldbody, "geom",
@@ -111,26 +137,32 @@ class MazeEnv(gym.Env):
conaffinity="1",
rgba="0.4 0.4 0.4 1",
)
elif maze_env_utils.can_move(structure[i][j]): # Movable block.
elif maze_env_utils.can_move(struct): # Movable block.
# The "falling" blocks are shrunk slightly and increased in mass to
# ensure that it can fall easily through a gap in the platform blocks.
falling = maze_env_utils.can_move_z(structure[i][j])
shrink = 0.99 if falling else 1.0
moveable_body = ET.SubElement(
name = "movable_%d_%d" % (i, j)
self.movable_blocks.append((name, struct))
falling = maze_env_utils.can_move_z(struct)
spinning = maze_env_utils.can_spin(struct)
x_offset = 0.25 * size_scaling if spinning else 0.0
y_offset = 0.0
shrink = 0.1 if spinning else 0.99 if falling else 1.0
height_shrink = 0.1 if spinning else 1.0
movable_body = ET.SubElement(
worldbody, "body",
name="moveable_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
name=name,
pos="%f %f %f" % (j * size_scaling - torso_x + x_offset,
i * size_scaling - torso_y + y_offset,
height_offset +
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
)
ET.SubElement(
moveable_body, "geom",
movable_body, "geom",
name="block_%d_%d" % (i, j),
pos="0 0 0",
size="%f %f %f" % (0.5 * size_scaling * shrink,
0.5 * size_scaling * shrink,
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
type="box",
material="",
mass="0.001" if falling else "0.0002",
@@ -138,45 +170,56 @@ class MazeEnv(gym.Env):
conaffinity="1",
rgba="0.9 0.1 0.1 1"
)
if maze_env_utils.can_move_x(structure[i][j]):
if maze_env_utils.can_move_x(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="1 0 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_x_%d_%d" % (i, j),
name="movable_x_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_y(structure[i][j]):
if maze_env_utils.can_move_y(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 1 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_y_%d_%d" % (i, j),
name="movable_y_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_z(structure[i][j]):
if maze_env_utils.can_move_z(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="true",
range="%f 0" % (-height_offset),
margin="0.01",
name="moveable_z_%d_%d" % (i, j),
name="movable_z_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_spin(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="false",
name="spinable_%d_%d" % (i, j),
pos="0 0 0",
type="ball"
)
torso = tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
@@ -190,13 +233,203 @@ class MazeEnv(gym.Env):
self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs)
def get_ori(self):
return self.wrapped_env.get_ori()
def get_top_down_view(self):
self._view = np.zeros_like(self._view)
def valid(row, col):
return self._view.shape[0] > row >= 0 and self._view.shape[1] > col >= 0
def update_view(x, y, d, row=None, col=None):
if row is None or col is None:
x = x - self._robot_x
y = y - self._robot_y
th = self._robot_ori
row, col = self._xy_to_rowcol(x, y)
update_view(x, y, d, row=row, col=col)
return
row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1
if row_frac < 0:
row_frac += 1
if col_frac < 0:
col_frac += 1
if valid(row, col):
self._view[row, col, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row - 1, col):
self._view[row - 1, col, d] += (
(max(0., 0.5 - row_frac)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row + 1, col):
self._view[row + 1, col, d] += (
(max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row, col - 1):
self._view[row, col - 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., 0.5 - col_frac)))
if valid(row, col + 1):
self._view[row, col + 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., col_frac - 0.5)))
if valid(row - 1, col - 1):
self._view[row - 1, col - 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac))
if valid(row - 1, col + 1):
self._view[row - 1, col + 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5))
if valid(row + 1, col + 1):
self._view[row + 1, col + 1, d] += (
(max(0., row_frac - 0.5)) * max(0., col_frac - 0.5))
if valid(row + 1, col - 1):
self._view[row + 1, col - 1, d] += (
(max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac))
# Draw ant.
robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2]
self._robot_x = robot_x
self._robot_y = robot_y
self._robot_ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
# Draw immovable blocks and chasms.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1: # Wall.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
0)
if structure[i][j] == -1: # Chasm.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
1)
# Draw movable blocks.
for block_name, block_type in self.movable_blocks:
block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2]
update_view(block_x, block_y, 2)
return self._view
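# Editor's note: a minimal standalone sketch (not part of this commit) of the
# fractional-overlap weighting that update_view() above applies: a point at a
# fractional (row, col) contributes to its cell and its neighbors in proportion
# to how a unit square centered on the point overlaps each cell, so the weights
# sum to 1 whenever no neighbor falls outside the grid.
import numpy as np

def splat(row, col, shape=(5, 5)):
    view = np.zeros(shape)
    r, rf = int(row), row % 1
    c, cf = int(col), col % 1
    for dr, wr in [(-1, max(0., 0.5 - rf)),
                   (0, min(1., rf + 0.5) - max(0., rf - 0.5)),
                   (1, max(0., rf - 0.5))]:
        for dc, wc in [(-1, max(0., 0.5 - cf)),
                       (0, min(1., cf + 0.5) - max(0., cf - 0.5)),
                       (1, max(0., cf - 0.5))]:
            rr, cc = r + dr, c + dc
            if 0 <= rr < shape[0] and 0 <= cc < shape[1]:
                view[rr, cc] += wr * wc
    return view

assert abs(splat(2.3, 1.8).sum() - 1.0) < 1e-9  # all mass is accounted for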
def get_range_sensor_obs(self):
"""Returns egocentric range sensor observations of maze."""
robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3]
ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
segments = []
# Get line segments (corresponding to outer boundary) of each immovable
# block or drop-off.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]: # There's a wall or drop-off.
cx = j * size_scaling - self._init_torso_x
cy = i * size_scaling - self._init_torso_y
x1 = cx - 0.5 * size_scaling
x2 = cx + 0.5 * size_scaling
y1 = cy - 0.5 * size_scaling
y2 = cy + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=structure[i][j],
))
# Get line segments (corresponding to outer boundary) of each movable
# block within the agent's z-view.
for block_name, block_type in self.movable_blocks:
block_x, block_y, block_z = self.wrapped_env.get_body_com(block_name)[:3]
if (block_z + height * size_scaling / 2 >= robot_z and
robot_z >= block_z - height * size_scaling / 2): # Block in view.
x1 = block_x - 0.5 * size_scaling
x2 = block_x + 0.5 * size_scaling
y1 = block_y - 0.5 * size_scaling
y2 = block_y + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=block_type,
))
sensor_readings = np.zeros((self._n_bins, 3)) # 3 for wall, drop-off, block
for ray_idx in range(self._n_bins):
ray_ori = (ori - self._sensor_span * 0.5 +
(2 * ray_idx + 1.0) / (2 * self._n_bins) * self._sensor_span)
ray_segments = []
# Get all segments that intersect with ray.
for seg in segments:
p = maze_env_utils.ray_segment_intersect(
ray=((robot_x, robot_y), ray_ori),
segment=seg["segment"])
if p is not None:
ray_segments.append(dict(
segment=seg["segment"],
type=seg["type"],
ray_ori=ray_ori,
distance=maze_env_utils.point_distance(p, (robot_x, robot_y)),
))
if len(ray_segments) > 0:
# Find out which segment is intersected first.
first_seg = sorted(ray_segments, key=lambda x: x["distance"])[0]
seg_type = first_seg["type"]
idx = (0 if seg_type == 1 else # Wall.
1 if seg_type == -1 else # Drop-off.
2 if maze_env_utils.can_move(seg_type) else # Block.
None)
if first_seg["distance"] <= self._sensor_range:
sensor_readings[ray_idx][idx] = (self._sensor_range - first_seg["distance"]) / self._sensor_range
return sensor_readings
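# Editor's note: a small standalone illustration (not part of this commit) of how
# the per-ray orientations above are spaced: n_bins rays are centered on the
# robot's heading `ori` and spread evenly over `sensor_span`, one ray per bin center.
import math

def ray_orientations(ori, sensor_span, n_bins):
    return [ori - sensor_span * 0.5 +
            (2 * i + 1.0) / (2 * n_bins) * sensor_span
            for i in range(n_bins)]

rays = ray_orientations(ori=0.0, sensor_span=math.pi, n_bins=4)
# Rays land at the bin centers of [-pi/2, pi/2]: -3pi/8, -pi/8, pi/8, 3pi/8.
assert abs(rays[0] + 3 * math.pi / 8) < 1e-9 and abs(rays[-1] - 3 * math.pi / 8) < 1e-9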
def _get_obs(self):
return np.concatenate([self.wrapped_env._get_obs(),
[self.t * 0.001]])
wrapped_obs = self.wrapped_env._get_obs()
if self._top_down_view:
view = [self.get_top_down_view().flat]
else:
view = []
if self._observe_blocks:
additional_obs = []
for block_name, block_type in self.movable_blocks:
additional_obs.append(self.wrapped_env.get_body_com(block_name))
wrapped_obs = np.concatenate([wrapped_obs[:3]] + additional_obs +
[wrapped_obs[3:]])
range_sensor_obs = self.get_range_sensor_obs()
return np.concatenate([wrapped_obs,
range_sensor_obs.flat] +
view + [[self.t * 0.001]])
def reset(self):
self.t = 0
self.trajectory = []
self.wrapped_env.reset()
if len(self._init_positions) > 1:
xy = random.choice(self._init_positions)
self.wrapped_env.set_xy(xy)
return self._get_obs()
@property
@@ -226,9 +459,41 @@ class MazeEnv(gym.Env):
return j * size_scaling, i * size_scaling
assert False, 'No robot in maze specification.'
def _find_all_robots(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
coords = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
coords.append((j * size_scaling, i * size_scaling))
return coords
def _is_in_collision(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
return True
return False
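# Editor's note: a standalone sketch (not part of this commit) of the axis-aligned
# box test used in _is_in_collision() above, for a single wall cell. Each wall
# cell (i, j) covers a square of side size_scaling centered on its grid position,
# shifted so that the robot's starting cell maps to the origin.
def in_wall_cell(pos, i, j, size_scaling, init_torso_x, init_torso_y):
    x, y = pos
    cx = j * size_scaling - init_torso_x
    cy = i * size_scaling - init_torso_y
    half = size_scaling * 0.5
    return (cx - half <= x <= cx + half) and (cy - half <= y <= cy + half)

# A point 3 units from the center of a wall cell at the origin (size_scaling=8) collides.
assert in_wall_cell((3.0, 0.0), i=0, j=0, size_scaling=8, init_torso_x=0.0, init_torso_y=0.0)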
def step(self, action):
self.t += 1
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
if self._manual_collision:
old_pos = self.wrapped_env.get_xy()
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
new_pos = self.wrapped_env.get_xy()
if self._is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
next_obs = self._get_obs()
done = False
return next_obs, inner_reward, done, info
@@ -26,20 +26,27 @@ class Move(object):
XZ = 15
YZ = 16
XYZ = 17
SpinXY = 18
def can_move_x(movable):
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ]
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ,
Move.SpinXY]
def can_move_y(movable):
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ]
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ,
Move.SpinXY]
def can_move_z(movable):
return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ]
def can_spin(movable):
return movable in [Move.SpinXY]
def can_move(movable):
return can_move_x(movable) or can_move_y(movable) or can_move_z(movable)
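# Editor's note: a usage sketch (not part of this commit), assuming the package
# layout from this diff (environments/maze_env_utils.py) is importable. A SpinXY
# block reports as movable in x and y and as spinnable, but not movable in z.
from environments import maze_env_utils as meu

spin = meu.Move.SpinXY
assert meu.can_move(spin) and meu.can_move_x(spin) and meu.can_move_y(spin)
assert meu.can_spin(spin) and not meu.can_move_z(spin)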
@@ -70,7 +77,88 @@ def construct_maze(maze_id='Maze'):
[1, 0, 0, 1],
[1, 1, 1, 1],
]
elif maze_id == 'Block':
O = 'r'
structure = [
[1, 1, 1, 1, 1],
[1, O, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'BlockMaze':
O = 'r'
structure = [
[1, 1, 1, 1],
[1, O, 0, 1],
[1, 1, 0, 1],
[1, 0, 0, 1],
[1, 1, 1, 1],
]
else:
raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id)
return structure
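# Editor's note: a usage sketch (not part of this commit), assuming the import
# path from this diff. Cell codes as consumed by maze_env.py above: 1 = wall,
# 0 = open space, 'r' = robot start, -1 = chasm; movable-block cells use the
# Move codes defined earlier in this file.
from environments import maze_env_utils as meu

structure = meu.construct_maze(maze_id='BlockMaze')
robot_cells = [(i, j) for i, row in enumerate(structure)
               for j, cell in enumerate(row) if cell == 'r']
assert robot_cells == [(1, 1)]  # 'r' sits one cell in from the top-left wall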
def line_intersect(pt1, pt2, ptA, ptB):
"""
Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html.
Returns the intersection of Line(pt1, pt2) and Line(ptA, ptB).
"""
DET_TOLERANCE = 0.00000001
# the first line is pt1 + r*(pt2-pt1)
# in component form:
x1, y1 = pt1
x2, y2 = pt2
dx1 = x2 - x1
dy1 = y2 - y1
# the second line is ptA + s*(ptB-ptA)
x, y = ptA
xB, yB = ptB
dx = xB - x
dy = yB - y
DET = (-dx1 * dy + dy1 * dx)
if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0)
# now, the determinant should be OK
DETinv = 1.0 / DET
# find the scalar amount along the "self" segment
r = DETinv * (-dy * (x - x1) + dx * (y - y1))
# find the scalar amount along the input line
s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1))
# return the average of the two descriptions
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
return (xi, yi, 1, r, s)
def ray_segment_intersect(ray, segment):
"""
Check whether the ray originating from (x, y) with direction theta intersects the
line segment (x1, y1) -- (x2, y2), and return the intersection point if there is one.
"""
(x, y), theta = ray
# (x1, y1), (x2, y2) = segment
pt1 = (x, y)
ray_len = 1
pt2 = (x + ray_len * math.cos(theta), y + ray_len * math.sin(theta))
xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment)
if valid and r >= 0 and 0 <= s <= 1:
return (xo, yo)
return None
def point_distance(p1, p2):
x1, y1 = p1
x2, y2 = p2
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
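# Editor's note: a sanity-check sketch (not part of this commit), assuming the
# import path from this diff. A ray fired along +x from the origin hits a vertical
# segment at x = 2 exactly 2 units away; a ray fired along +y misses it.
import math
from environments import maze_env_utils as meu

hit = meu.ray_segment_intersect(ray=((0., 0.), 0.), segment=((2., -1.), (2., 1.)))
assert hit is not None and abs(meu.point_distance(hit, (0., 0.)) - 2.0) < 1e-6
assert meu.ray_segment_intersect(ray=((0., 0.), math.pi / 2),
                                 segment=((2., -1.), (2., 1.))) is None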
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "point.xml"
ORI_IND = 2
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
mujoco_env.MujocoEnv.__init__(self, file_path, 1)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, action):
action[0] = 0.2 * action[0]
qpos = np.copy(self.physics.data.qpos)
qpos[2] += action[1]
ori = qpos[2]
# compute increment in each direction
dx = math.cos(ori) * action[0]
dy = math.sin(ori) * action[0]
# ensure that the robot is within reasonable range
qpos[0] = np.clip(qpos[0] + dx, -100, 100)
qpos[1] = np.clip(qpos[1] + dy, -100, 100)
qvel = self.physics.data.qvel
self.set_state(qpos, qvel)
for _ in range(0, self.frame_skip):
self.physics.step()
next_obs = self._get_obs()
reward = 0
done = False
info = {}
return next_obs, reward, done, info
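# Editor's note: a standalone sketch (not part of this commit) of the kinematics
# applied in PointEnv.step() above: action[1] turns the heading, 0.2 * action[0]
# moves the point forward along the new heading, and positions are clipped to
# [-100, 100].
import math
import numpy as np

def point_step(x, y, ori, action):
    forward = 0.2 * action[0]
    ori = ori + action[1]
    x = np.clip(x + math.cos(ori) * forward, -100, 100)
    y = np.clip(y + math.sin(ori) * forward, -100, 100)
    return x, y, ori

x, y, ori = point_step(0., 0., 0., action=[1.0, math.pi / 2])
assert abs(x) < 1e-9 and abs(y - 0.2) < 1e-9  # turned 90 degrees, then moved 0.2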
def _get_obs(self):
if self._expose_all_qpos:
return np.concatenate([
self.physics.data.qpos.flat[:3], # Only point-relevant coords.
self.physics.data.qvel.flat[:3]])
return np.concatenate([
self.physics.data.qpos.flat[2:3],
self.physics.data.qvel.flat[:3]])
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(
size=self.physics.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.physics.model.nv) * .1
# Set everything other than point to original position and 0 velocity.
qpos[3:] = self.init_qpos[3:]
qvel[3:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def get_ori(self):
return self.model.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.physics.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.physics.data.qvel
# Copyright 2018 The TensorFlow Authors.
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,4 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from environments.maze_env import MazeEnv
from environments.point import PointEnv
class PointMazeEnv(MazeEnv):
MODEL_CLASS = PointEnv
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Script for evaluating a UVF agent.
To run locally: See run_eval.py
To run on borg: See train_eval.borg
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
slim = tf.contrib.slim
import gin.tf
# pylint: disable=unused-import
import agent
import train
from utils import utils as uvf_utils
from utils import eval_utils
from environments import create_maze_env
# pylint: enable=unused-import
flags = tf.app.flags
flags.DEFINE_string('eval_dir', None,
'Directory for writing logs/summaries during eval.')
flags.DEFINE_string('checkpoint_dir', None,
'Directory containing checkpoints to eval.')
FLAGS = flags.FLAGS
def get_evaluate_checkpoint_fn(master, output_dir, eval_step_fns,
model_rollout_fn, gamma, max_steps_per_episode,
num_episodes_eval, num_episodes_videos,
tuner_hook, generate_videos,
generate_summaries, video_settings):
"""Returns a function that evaluates a given checkpoint.
Args:
master: BNS name of the TensorFlow master
output_dir: The output directory to which the metric summaries are written.
eval_step_fns: A dictionary of functions that return a list of
[state, action, reward, discount, transition_type] tensors,
indexed by summary tag name.
model_rollout_fn: Model rollout fn.
gamma: Discount factor for the reward.
max_steps_per_episode: Maximum steps to run each episode for.
num_episodes_eval: Number of episodes to evaluate and average reward over.
num_episodes_videos: Number of episodes to record for video.
tuner_hook: A callable(average reward, global step) that updates a Vizier
tuner trial.
generate_videos: Whether to generate videos of the agent in action.
generate_summaries: Whether to generate summaries.
video_settings: Settings for generating videos of the agent.
Returns:
A function that evaluates a checkpoint.
"""
sess = tf.Session(master, graph=tf.get_default_graph())
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
summary_writer = tf.summary.FileWriter(output_dir)
def evaluate_checkpoint(checkpoint_path):
"""Performs a one-time evaluation of the given checkpoint.
Args:
checkpoint_path: Checkpoint to evaluate.
Returns:
True if the evaluation process should stop
"""
restore_fn = tf.contrib.framework.assign_from_checkpoint_fn(
checkpoint_path,
uvf_utils.get_all_vars(),
ignore_missing_vars=True,
reshape_variables=False)
assert restore_fn is not None, 'cannot restore %s' % checkpoint_path
restore_fn(sess)
global_step = sess.run(slim.get_global_step())
should_stop = False
max_reward = -1e10
max_meta_reward = -1e10
for eval_tag, (eval_step, env_base,) in sorted(eval_step_fns.items()):
if hasattr(env_base, 'set_sess'):
env_base.set_sess(sess) # set session
if generate_summaries:
tf.logging.info(
'[%s] Computing average reward over %d episodes at global step %d.',
eval_tag, num_episodes_eval, global_step)
(average_reward, last_reward,
average_meta_reward, last_meta_reward, average_success,
states, actions) = eval_utils.compute_average_reward(
sess, env_base, eval_step, gamma, max_steps_per_episode,
num_episodes_eval)
tf.logging.info('[%s] Average reward = %f', eval_tag, average_reward)
tf.logging.info('[%s] Last reward = %f', eval_tag, last_reward)
tf.logging.info('[%s] Average meta reward = %f', eval_tag, average_meta_reward)
tf.logging.info('[%s] Last meta reward = %f', eval_tag, last_meta_reward)
tf.logging.info('[%s] Average success = %f', eval_tag, average_success)
if model_rollout_fn is not None:
preds, model_losses = eval_utils.compute_model_loss(
sess, model_rollout_fn, states, actions)
for i, (pred, state, model_loss) in enumerate(
zip(preds, states, model_losses)):
tf.logging.info('[%s] Model rollout step %d: loss=%f', eval_tag, i,
model_loss)
tf.logging.info('[%s] Model rollout step %d: pred=%s', eval_tag, i,
str(pred.tolist()))
tf.logging.info('[%s] Model rollout step %d: state=%s', eval_tag, i,
str(state.tolist()))
# Report the eval stats to the tuner.
if average_reward > max_reward:
max_reward = average_reward
if average_meta_reward > max_meta_reward:
max_meta_reward = average_meta_reward
for (tag, value) in [('Reward/average_%s_reward', average_reward),
('Reward/last_%s_reward', last_reward),
('Reward/average_%s_meta_reward', average_meta_reward),
('Reward/last_%s_meta_reward', last_meta_reward),
('Reward/average_%s_success', average_success)]:
summary_str = tf.Summary(value=[
tf.Summary.Value(
tag=tag % eval_tag,
simple_value=value)
])
summary_writer.add_summary(summary_str, global_step)
summary_writer.flush()
if generate_videos or should_stop:
# Do a manual reset before generating the video to see the initial
# pose of the robot, towards which the reset controller is moving.
if hasattr(env_base, '_gym_env'):
tf.logging.info('Resetting before recording video')
if hasattr(env_base._gym_env, 'reset_model'):
env_base._gym_env.reset_model() # pylint: disable=protected-access
else:
env_base._gym_env.wrapped_env.reset_model()
video_filename = os.path.join(output_dir, 'videos',
'%s_step_%d.mp4' % (eval_tag,
global_step))
eval_utils.capture_video(sess, eval_step, env_base,
max_steps_per_episode * num_episodes_videos,
video_filename, video_settings,
reset_every=max_steps_per_episode)
should_stop = should_stop or (generate_summaries and tuner_hook and
tuner_hook(max_reward, global_step))
return bool(should_stop)
return evaluate_checkpoint
def get_model_rollout(uvf_agent, tf_env):
"""Model rollout function."""
state_spec = tf_env.observation_spec()[0]
action_spec = tf_env.action_spec()[0]
state_ph = tf.placeholder(dtype=state_spec.dtype, shape=state_spec.shape)
action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
merged_state = uvf_agent.merged_state(state_ph)
diff_value = uvf_agent.critic_net(tf.expand_dims(merged_state, 0),
tf.expand_dims(action_ph, 0))[0]
diff_value = tf.cast(diff_value, dtype=state_ph.dtype)
state_ph.shape.assert_is_compatible_with(diff_value.shape)
next_state = state_ph + diff_value
def model_rollout_fn(sess, state, action):
return sess.run(next_state, feed_dict={state_ph: state, action_ph: action})
return model_rollout_fn
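# Editor's note: a numpy stand-in (not part of this commit) for the call pattern
# above: the critic's output is treated as a predicted state difference, so one
# rollout step is simply state + diff. `fake_critic_diff` is a hypothetical
# placeholder for the learned critic, not an API from this codebase.
import numpy as np

def make_model_rollout_fn(fake_critic_diff):
    def model_rollout_fn(state, action):
        return state + fake_critic_diff(state, action)
    return model_rollout_fn

rollout = make_model_rollout_fn(lambda s, a: 0.1 * a)  # toy "critic": diff = 0.1 * action
next_state = rollout(np.zeros(3), np.ones(3))
assert np.allclose(next_state, 0.1)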
def get_eval_step(uvf_agent,
state_preprocess,
tf_env,
action_fn,
meta_action_fn,
environment_steps,
num_episodes,
mode='eval'):
"""Get one-step policy/env stepping ops.
Args:
uvf_agent: A UVF agent.
tf_env: A TFEnvironment.
action_fn: A function to produce actions given current state.
meta_action_fn: A function to produce meta actions given current state.
environment_steps: A variable to count the number of steps in the tf_env.
num_episodes: A variable to count the number of episodes.
mode: a string representing the mode=[train, explore, eval].
Returns:
A step_fn that runs one step of the policy in the environment and returns
the evaluation tensors.
"""
tf_env.start_collect()
state = tf_env.current_obs()
action = action_fn(state, context=None)
state_repr = state_preprocess(state)
action_spec = tf_env.action_spec()
action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
with tf.control_dependencies([state]):
transition_type, reward, discount = tf_env.step(action_ph)
def increment_step():
return environment_steps.assign_add(1)
def increment_episode():
return num_episodes.assign_add(1)
def no_op_int():
return tf.constant(0, dtype=tf.int64)
step_cond = uvf_agent.step_cond_fn(state, action,
transition_type,
environment_steps, num_episodes)
reset_episode_cond = uvf_agent.reset_episode_cond_fn(
state, action,
transition_type, environment_steps, num_episodes)
reset_env_cond = uvf_agent.reset_env_cond_fn(state, action,
transition_type,
environment_steps, num_episodes)
increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
with tf.control_dependencies([increment_step_op]):
increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
no_op_int)
with tf.control_dependencies([reward, discount]):
next_state = tf_env.current_obs()
next_state_repr = state_preprocess(next_state)
with tf.control_dependencies([increment_episode_op]):
post_reward, post_meta_reward = uvf_agent.cond_begin_episode_op(
tf.logical_not(reset_episode_cond),
[state, action_ph, reward, next_state,
state_repr, next_state_repr],
mode=mode, meta_action_fn=meta_action_fn)
# Important: do manual reset after getting the final reward from the
# unreset environment.
with tf.control_dependencies([post_reward, post_meta_reward]):
cond_reset_op = tf.cond(reset_env_cond,
tf_env.reset,
tf_env.current_time_step)
# Add a dummy control dependency to force the reset_op to run
with tf.control_dependencies(cond_reset_op):
post_reward, post_meta_reward = map(tf.identity, [post_reward, post_meta_reward])
eval_step = [next_state, action_ph, transition_type, post_reward, post_meta_reward, discount, uvf_agent.context_vars, state_repr]
if callable(action):
def step_fn(sess):
action_value = action(sess)
return sess.run(eval_step, feed_dict={action_ph: action_value})
else:
action = uvf_utils.clip_to_spec(action, action_spec)
def step_fn(sess):
action_value = sess.run(action)
return sess.run(eval_step, feed_dict={action_ph: action_value})
return step_fn
@gin.configurable
def evaluate(checkpoint_dir,
eval_dir,
environment=None,
num_bin_actions=3,
agent_class=None,
meta_agent_class=None,
state_preprocess_class=None,
gamma=1.0,
num_episodes_eval=10,
eval_interval_secs=60,
max_number_of_evaluations=None,
checkpoint_timeout=None,
timeout_fn=None,
tuner_hook=None,
generate_videos=False,
generate_summaries=True,
num_episodes_videos=5,
video_settings=None,
eval_modes=('eval',),
eval_model_rollout=False,
policy_save_dir='policy',
checkpoint_range=None,
checkpoint_path=None,
max_steps_per_episode=None,
evaluate_nohrl=False):
"""Loads and repeatedly evaluates a checkpointed model at a set interval.
Args:
checkpoint_dir: The directory where the checkpoints reside.
eval_dir: Directory to save the evaluation summary results.
environment: A BaseEnvironment to evaluate.
num_bin_actions: Number of bins for discretizing continuous actions.
agent_class: An RL agent class.
meta_agent_class: A Meta agent class.
gamma: Discount factor for the reward.
num_episodes_eval: Number of episodes to evaluate and average reward over.
eval_interval_secs: The number of seconds between each evaluation run.
max_number_of_evaluations: The max number of evaluations. If None the
evaluation continues indefinitely.
checkpoint_timeout: The maximum amount of time to wait between checkpoints.
If left as `None`, then the process will wait indefinitely.
timeout_fn: Optional function to call after a timeout.
tuner_hook: A callable that takes the average reward and global step and
updates a Vizier tuner trial.
generate_videos: Whether to generate videos of the agent in action.
generate_summaries: Whether to generate summaries.
num_episodes_videos: Number of episodes to evaluate for generating videos.
video_settings: Settings for generating videos of the agent.
eval_modes: A tuple of eval modes.
eval_model_rollout: Evaluate model rollout.
policy_save_dir: Optional sub-directory where the policies are
saved.
checkpoint_range: Optional. If provided, evaluate all checkpoints in
the range.
checkpoint_path: Optional sub-directory specifying which checkpoint to
evaluate. If None, will evaluate the most recent checkpoint.
"""
tf_env = create_maze_env.TFPyEnvironment(environment)
observation_spec = [tf_env.observation_spec()]
action_spec = [tf_env.action_spec()]
assert max_steps_per_episode, 'max_steps_per_episode must be set'
if agent_class.ACTION_TYPE == 'discrete':
assert False
else:
assert agent_class.ACTION_TYPE == 'continuous'
if meta_agent_class is not None:
assert agent_class.ACTION_TYPE == meta_agent_class.ACTION_TYPE
with tf.variable_scope('meta_agent'):
meta_agent = meta_agent_class(
observation_spec,
action_spec,
tf_env,
)
else:
meta_agent = None
with tf.variable_scope('uvf_agent'):
uvf_agent = agent_class(
observation_spec,
action_spec,
tf_env,
)
uvf_agent.set_meta_agent(agent=meta_agent)
with tf.variable_scope('state_preprocess'):
state_preprocess = state_preprocess_class()
# run both actor and critic once to ensure networks are initialized
# and gin configs will be saved
# pylint: disable=protected-access
temp_states = tf.expand_dims(
tf.zeros(
dtype=uvf_agent._observation_spec.dtype,
shape=uvf_agent._observation_spec.shape), 0)
# pylint: enable=protected-access
temp_actions = uvf_agent.actor_net(temp_states)
uvf_agent.critic_net(temp_states, temp_actions)
# create eval_step_fns for each action function
eval_step_fns = dict()
meta_agent = uvf_agent.meta_agent
for meta in [True] + [False] * evaluate_nohrl:
meta_tag = 'hrl' if meta else 'nohrl'
uvf_agent.set_meta_agent(meta_agent if meta else None)
for mode in eval_modes:
# wrap environment
wrapped_environment = uvf_agent.get_env_base_wrapper(
environment, mode=mode)
action_wrapper = lambda agent_: agent_.action
action_fn = action_wrapper(uvf_agent)
meta_action_fn = action_wrapper(meta_agent)
eval_step_fns['%s_%s' % (mode, meta_tag)] = (get_eval_step(
uvf_agent=uvf_agent,
state_preprocess=state_preprocess,
tf_env=tf_env,
action_fn=action_fn,
meta_action_fn=meta_action_fn,
environment_steps=tf.Variable(
0, dtype=tf.int64, name='environment_steps'),
num_episodes=tf.Variable(0, dtype=tf.int64, name='num_episodes'),
mode=mode), wrapped_environment,)
model_rollout_fn = None
if eval_model_rollout:
model_rollout_fn = get_model_rollout(uvf_agent, tf_env)
tf.train.get_or_create_global_step()
if policy_save_dir:
checkpoint_dir = os.path.join(checkpoint_dir, policy_save_dir)
tf.logging.info('Evaluating policies at %s', checkpoint_dir)
tf.logging.info('Running episodes for max %d steps', max_steps_per_episode)
evaluate_checkpoint_fn = get_evaluate_checkpoint_fn(
'', eval_dir, eval_step_fns, model_rollout_fn, gamma,
max_steps_per_episode, num_episodes_eval, num_episodes_videos, tuner_hook,
generate_videos, generate_summaries, video_settings)
if checkpoint_path is not None:
checkpoint_path = os.path.join(checkpoint_dir, checkpoint_path)
evaluate_checkpoint_fn(checkpoint_path)
elif checkpoint_range is not None:
model_files = tf.gfile.Glob(
os.path.join(checkpoint_dir, 'model.ckpt-*.index'))
tf.logging.info('Found %s policies at %s', len(model_files), checkpoint_dir)
model_files = {
int(f.split('model.ckpt-', 1)[1].split('.', 1)[0]):
os.path.splitext(f)[0]
for f in model_files
}
model_files = {
k: v
for k, v in model_files.items()
if k >= checkpoint_range[0] and k <= checkpoint_range[1]
}
tf.logging.info('Evaluating %d policies at %s',
len(model_files), checkpoint_dir)
for _, checkpoint_path in sorted(model_files.items()):
evaluate_checkpoint_fn(checkpoint_path)
else:
eval_utils.evaluate_checkpoint_repeatedly(
checkpoint_dir,
evaluate_checkpoint_fn,
eval_interval_secs=eval_interval_secs,
max_number_of_evaluations=max_number_of_evaluations,
checkpoint_timeout=checkpoint_timeout,
timeout_fn=timeout_fn)
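# Editor's note: a standalone sketch (not part of this commit) of the
# checkpoint_range filtering above: map 'model.ckpt-<step>.index' files to their
# global step and keep those inside the requested range. The filenames below are
# made up for illustration.
import os

model_files = ['/tmp/ckpts/model.ckpt-1000.index',
               '/tmp/ckpts/model.ckpt-2000.index',
               '/tmp/ckpts/model.ckpt-3000.index']
by_step = {int(f.split('model.ckpt-', 1)[1].split('.', 1)[0]): os.path.splitext(f)[0]
           for f in model_files}
checkpoint_range = (1500, 3000)
selected = {k: v for k, v in by_step.items()
            if checkpoint_range[0] <= k <= checkpoint_range[1]}
assert sorted(selected) == [2000, 3000]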
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Random policy on an environment."""
import tensorflow as tf
import numpy as np
import random
from environments import create_maze_env
app = tf.app
flags = tf.flags
logging = tf.logging
FLAGS = flags.FLAGS
flags.DEFINE_string('env', 'AntMaze', 'environment name: AntMaze, AntPush, or AntFall')
flags.DEFINE_integer('episode_length', 500, 'episode length')
flags.DEFINE_integer('num_episodes', 50, 'number of episodes')
def get_goal_sample_fn(env_name):
if env_name == 'AntMaze':
# NOTE: When evaluating (i.e., computing the metrics shown in the paper),
# we use the commented-out goal sampling function. The uncommented one
# is only used for training.
#return lambda: np.array([0., 16.])
return lambda: np.random.uniform((-4, -4), (20, 20))
elif env_name == 'AntPush':
return lambda: np.array([0., 19.])
elif env_name == 'AntFall':
return lambda: np.array([0., 27., 4.5])
else:
assert False, 'Unknown env'
def get_reward_fn(env_name):
if env_name == 'AntMaze':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntPush':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntFall':
return lambda obs, goal: -np.sum(np.square(obs[:3] - goal)) ** 0.5
else:
assert False, 'Unknown env'
def success_fn(last_reward):
return last_reward > -5.0
class EnvWithGoal(object):
def __init__(self, base_env, env_name):
self.base_env = base_env
self.goal_sample_fn = get_goal_sample_fn(env_name)
self.reward_fn = get_reward_fn(env_name)
self.goal = None
def reset(self):
obs = self.base_env.reset()
self.goal = self.goal_sample_fn()
return np.concatenate([obs, self.goal])
def step(self, a):
obs, _, done, info = self.base_env.step(a)
reward = self.reward_fn(obs, self.goal)
return np.concatenate([obs, self.goal]), reward, done, info
@property
def action_space(self):
return self.base_env.action_space
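# Editor's note: a standalone sketch (not part of this commit) of the AntMaze
# reward and success criterion above: the reward is the negative Euclidean
# distance between the agent's (x, y) and the goal, and an episode counts as a
# success if the final reward exceeds -5 (i.e. the agent ends within 5 units).
import numpy as np

def ant_maze_reward(obs, goal):
    return -np.sum(np.square(obs[:2] - goal)) ** 0.5

obs = np.array([1.0, 16.0, 0.3])   # x, y, plus extra state dims
goal = np.array([0.0, 16.0])
reward = ant_maze_reward(obs, goal)
assert abs(reward + 1.0) < 1e-9 and reward > -5.0  # within 5 units: a success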
def run_environment(env_name, episode_length, num_episodes):
env = EnvWithGoal(
create_maze_env.create_maze_env(env_name).gym,
env_name)
def action_fn(obs):
action_space = env.action_space
action_space_mean = (action_space.low + action_space.high) / 2.0
action_space_magn = (action_space.high - action_space.low) / 2.0
random_action = (action_space_mean +
action_space_magn *
np.random.uniform(low=-1.0, high=1.0,
size=action_space.shape))
return random_action
rewards = []
successes = []
for ep in range(num_episodes):
rewards.append(0.0)
successes.append(False)
obs = env.reset()
for _ in range(episode_length):
obs, reward, done, _ = env.step(action_fn(obs))
rewards[-1] += reward
successes[-1] = success_fn(reward)
if done:
break
logging.info('Episode %d reward: %.2f, Success: %d', ep + 1, rewards[-1], successes[-1])
logging.info('Average Reward over %d episodes: %.2f',
num_episodes, np.mean(rewards))
logging.info('Average Success over %d episodes: %.2f',
num_episodes, np.mean(successes))
def main(unused_argv):
logging.set_verbosity(logging.INFO)
run_environment(FLAGS.env, FLAGS.episode_length, FLAGS.num_episodes)
if __name__ == '__main__':
app.run()
# Copyright 2018 The TensorFlow Authors.
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,8 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for config_util.py."""
r"""Script for evaluating a UVF agent.
To run locally: See scripts/local_eval.py
"""
from __future__ import absolute_import
from __future__ import division
@@ -20,43 +24,28 @@ from __future__ import print_function
import tensorflow as tf
from astronet.util import config_util
class ConfigUtilTest(tf.test.TestCase):
def testUnflatten(self):
# Empty dict.
self.assertDictEqual(config_util.unflatten({}), {})
# Already flat dict.
self.assertDictEqual(
config_util.unflatten({
"a": 1,
"b": 2
}), {
"a": 1,
"b": 2
})
# Nested dict.
self.assertDictEqual(
config_util.unflatten({
"a": 1,
"b.c": 2,
"b.d.e": 3,
"b.d.f": 4,
}), {
"a": 1,
"b": {
"c": 2,
"d": {
"e": 3,
"f": 4,
}
}
})
import gin.tf
# pylint: disable=unused-import
import eval as eval_
# pylint: enable=unused-import
flags = tf.app.flags
FLAGS = flags.FLAGS
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
assert FLAGS.checkpoint_dir, "Flag 'checkpoint_dir' must be set."
assert FLAGS.eval_dir, "Flag 'eval_dir' must be set."
if FLAGS.config_file:
for config_file in FLAGS.config_file:
gin.parse_config_file(config_file)
if FLAGS.params:
gin.parse_config(FLAGS.params)
eval_.evaluate(FLAGS.checkpoint_dir, FLAGS.eval_dir)
if __name__ == "__main__":
tf.test.main()
tf.app.run()
# Copyright 2018 The TensorFlow Authors.
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,8 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Kepler light curve inputs to the AstroWaveNet model."""
r"""Script for training an RL agent using the UVF algorithm.
To run locally: See scripts/local_train.py
"""
from __future__ import absolute_import
from __future__ import division
@@ -20,31 +24,26 @@ from __future__ import print_function
import tensorflow as tf
from astrowavenet.data import base
import gin.tf
# pylint: enable=unused-import
import train
# pylint: disable=unused-import
COND_INPUT_KEY = "mask"
AR_INPUT_KEY = "flux"
flags = tf.app.flags
FLAGS = flags.FLAGS
class KeplerLightCurves(base.TFRecordDataset):
"""Kepler light curve inputs to the AstroWaveNet model."""
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
if FLAGS.config_file:
for config_file in FLAGS.config_file:
gin.parse_config_file(config_file)
if FLAGS.params:
gin.parse_config(FLAGS.params)
def create_example_parser(self):
assert FLAGS.train_dir, "Flag 'train_dir' must be set."
return train.train_uvf(FLAGS.train_dir)
def _example_parser(serialized):
"""Parses a single tf.Example proto."""
features = tf.parse_single_example(
serialized,
features={
AR_INPUT_KEY: tf.VarLenFeature(tf.float32),
COND_INPUT_KEY: tf.VarLenFeature(tf.int64),
})
# Extract values from SparseTensor objects.
autoregressive_input = features[AR_INPUT_KEY].values
conditioning_stack = tf.to_float(features[COND_INPUT_KEY].values)
return {
"autoregressive_input": autoregressive_input,
"conditioning_stack": conditioning_stack,
}
return _example_parser
if __name__ == '__main__':
tf.app.run()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to run run_eval.py locally.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from subprocess import call
import sys
CONFIGS_PATH = 'configs'
CONTEXT_CONFIGS_PATH = 'context/configs'
def main():
bb = './'
base_num_args = 6
if len(sys.argv) < base_num_args:
print(
"usage: python %s <exp_name> <context_setting_gin> <context_gin> "
"<agent_gin> <suite> [params...]"
% sys.argv[0])
sys.exit(0)
exp = sys.argv[1]
context_setting = sys.argv[2]
context = sys.argv[3]
agent = sys.argv[4]
assert sys.argv[5] in ["suite"], "args[5] must be `suite'"
suite = ""
binary = "python {bb}/run_eval{suite}.py ".format(bb=bb, suite=suite)
h = os.environ["HOME"]
ucp = CONFIGS_PATH
ccp = CONTEXT_CONFIGS_PATH
extra = ''
command_str = ("{binary} "
"--logtostderr "
"--checkpoint_dir={h}/tmp/{context_setting}/{context}/{agent}/{exp}/train "
"--eval_dir={h}/tmp/{context_setting}/{context}/{agent}/{exp}/eval "
"--config_file={ucp}/{agent}.gin "
"--config_file={ucp}/eval_{extra}uvf.gin "
"--config_file={ccp}/{context_setting}.gin "
"--config_file={ccp}/{context}.gin ").format(
h=h,
ucp=ucp,
ccp=ccp,
context_setting=context_setting,
context=context,
agent=agent,
extra=extra,
suite=suite,
exp=exp,
binary=binary)
for extra_arg in sys.argv[base_num_args:]:
command_str += "--params='%s' " % extra_arg
print(command_str)
call(command_str, shell=True)
if __name__ == "__main__":
main()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to run run_train.py locally.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
from subprocess import call
import sys
CONFIGS_PATH = './configs'
CONTEXT_CONFIGS_PATH = './context/configs'
def main():
bb = './'
base_num_args = 6
if len(sys.argv) < base_num_args:
print(
"usage: python %s <exp_name> <context_setting_gin> <env_context_gin> "
"<agent_gin> <suite> [params...]"
% sys.argv[0])
sys.exit(0)
exp = sys.argv[1] # Name for experiment, e.g. 'test001'
context_setting = sys.argv[2] # Context setting, e.g. 'hiro_orig'
context = sys.argv[3] # Environment-specific context, e.g. 'ant_maze'
agent = sys.argv[4] # Agent settings, e.g. 'base_uvf'
assert sys.argv[5] in ["suite"], "args[5] must be `suite'"
suite = ""
binary = "python {bb}/run_train{suite}.py ".format(bb=bb, suite=suite)
h = os.environ["HOME"]
ucp = CONFIGS_PATH
ccp = CONTEXT_CONFIGS_PATH
extra = ''
port = random.randint(2000, 8000)
command_str = ("{binary} "
"--train_dir={h}/tmp/{context_setting}/{context}/{agent}/{exp}/train "
"--config_file={ucp}/{agent}.gin "
"--config_file={ucp}/train_{extra}uvf.gin "
"--config_file={ccp}/{context_setting}.gin "
"--config_file={ccp}/{context}.gin "
"--summarize_gradients=False "
"--save_interval_secs=60 "
"--save_summaries_secs=1 "
"--master=local "
"--alsologtostderr ").format(h=h, ucp=ucp,
context_setting=context_setting,
context=context, ccp=ccp,
suite=suite, agent=agent, extra=extra,
exp=exp, binary=binary,
port=port)
for extra_arg in sys.argv[base_num_args:]:
command_str += "--params='%s' " % extra_arg
print(command_str)
call(command_str, shell=True)
if __name__ == "__main__":
main()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Script for training an RL agent using the UVF algorithm.
To run locally: See run_train.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import tensorflow as tf
slim = tf.contrib.slim
import gin.tf
# pylint: disable=unused-import
import train_utils
import agent as agent_
from agents import circular_buffer
from utils import utils as uvf_utils
from environments import create_maze_env
# pylint: enable=unused-import
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('goal_sample_strategy', 'sample',
'None, sample, FuN')
LOAD_PATH = None
def collect_experience(tf_env, agent, meta_agent, state_preprocess,
replay_buffer, meta_replay_buffer,
action_fn, meta_action_fn,
environment_steps, num_episodes, num_resets,
episode_rewards, episode_meta_rewards,
store_context,
disable_agent_reset):
"""Collect experience in a tf_env into a replay_buffer using action_fn.
Args:
tf_env: A TFEnvironment.
agent: A UVF agent.
meta_agent: A Meta Agent.
replay_buffer: A Replay buffer to collect experience in.
meta_replay_buffer: A Replay buffer to collect meta agent experience in.
action_fn: A function to produce actions given current state.
meta_action_fn: A function to produce meta actions given current state.
environment_steps: A variable to count the number of steps in the tf_env.
num_episodes: A variable to count the number of episodes.
num_resets: A variable to count the number of resets.
store_context: A boolean indicating whether to store the context in the replay buffers.
disable_agent_reset: A boolean that prevents the agent from resetting.
Returns:
A collect_experience_op that executes an action and stores the transition into
the replay_buffers
"""
tf_env.start_collect()
state = tf_env.current_obs()
state_repr = state_preprocess(state)
action = action_fn(state, context=None)
with tf.control_dependencies([state]):
transition_type, reward, discount = tf_env.step(action)
def increment_step():
return environment_steps.assign_add(1)
def increment_episode():
return num_episodes.assign_add(1)
def increment_reset():
return num_resets.assign_add(1)
def update_episode_rewards(context_reward, meta_reward, reset):
new_episode_rewards = tf.concat(
[episode_rewards[:1] + context_reward, episode_rewards[1:]], 0)
new_episode_meta_rewards = tf.concat(
[episode_meta_rewards[:1] + meta_reward,
episode_meta_rewards[1:]], 0)
return tf.group(
episode_rewards.assign(
tf.cond(reset,
lambda: tf.concat([[0.], episode_rewards[:-1]], 0),
lambda: new_episode_rewards)),
episode_meta_rewards.assign(
tf.cond(reset,
lambda: tf.concat([[0.], episode_meta_rewards[:-1]], 0),
lambda: new_episode_meta_rewards)))
def no_op_int():
return tf.constant(0, dtype=tf.int64)
step_cond = agent.step_cond_fn(state, action,
transition_type,
environment_steps, num_episodes)
reset_episode_cond = agent.reset_episode_cond_fn(
state, action,
transition_type, environment_steps, num_episodes)
reset_env_cond = agent.reset_env_cond_fn(state, action,
transition_type,
environment_steps, num_episodes)
increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
no_op_int)
increment_reset_op = tf.cond(reset_env_cond, increment_reset, no_op_int)
increment_op = tf.group(increment_step_op, increment_episode_op,
increment_reset_op)
with tf.control_dependencies([increment_op, reward, discount]):
next_state = tf_env.current_obs()
next_state_repr = state_preprocess(next_state)
next_reset_episode_cond = tf.logical_or(
agent.reset_episode_cond_fn(
state, action,
transition_type, environment_steps, num_episodes),
tf.equal(discount, 0.0))
if store_context:
context = [tf.identity(var) + tf.zeros_like(var) for var in agent.context_vars]
meta_context = [tf.identity(var) + tf.zeros_like(var) for var in meta_agent.context_vars]
else:
context = []
meta_context = []
with tf.control_dependencies([next_state] + context + meta_context):
if disable_agent_reset:
collect_experience_ops = [tf.no_op()] # don't reset agent
else:
collect_experience_ops = agent.cond_begin_episode_op(
tf.logical_not(reset_episode_cond),
[state, action, reward, next_state,
state_repr, next_state_repr],
mode='explore', meta_action_fn=meta_action_fn)
context_reward, meta_reward = collect_experience_ops
collect_experience_ops = list(collect_experience_ops)
collect_experience_ops.append(
update_episode_rewards(tf.reduce_sum(context_reward), meta_reward,
reset_episode_cond))
meta_action_every_n = agent.tf_context.meta_action_every_n
with tf.control_dependencies(collect_experience_ops):
transition = [state, action, reward, discount, next_state]
meta_action = tf.to_float(
tf.concat(context, -1)) # Meta agent action is low-level context
meta_end = tf.logical_and( # End of meta-transition.
tf.equal(agent.tf_context.t % meta_action_every_n, 1),
agent.tf_context.t > 1)
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
states_var = tf.get_variable('states_var',
[meta_action_every_n, state.shape[-1]],
state.dtype)
actions_var = tf.get_variable('actions_var',
[meta_action_every_n, action.shape[-1]],
action.dtype)
state_var = tf.get_variable('state_var', state.shape, state.dtype)
reward_var = tf.get_variable('reward_var', reward.shape, reward.dtype)
meta_action_var = tf.get_variable('meta_action_var',
meta_action.shape, meta_action.dtype)
meta_context_var = [
tf.get_variable('meta_context_var%d' % idx,
meta_context[idx].shape, meta_context[idx].dtype)
for idx in range(len(meta_context))]
actions_var_upd = tf.scatter_update(
actions_var, (agent.tf_context.t - 2) % meta_action_every_n, action)
with tf.control_dependencies([actions_var_upd]):
actions = tf.identity(actions_var) + tf.zeros_like(actions_var)
meta_reward = tf.identity(meta_reward) + tf.zeros_like(meta_reward)
meta_reward = tf.reshape(meta_reward, reward.shape)
reward = 0.1 * meta_reward
meta_transition = [state_var, meta_action_var,
reward_var + reward,
discount * (1 - tf.to_float(next_reset_episode_cond)),
next_state]
meta_transition.extend([states_var, actions])
if store_context: # store current and next context into replay
transition += context + list(agent.context_vars)
meta_transition += meta_context_var + list(meta_agent.context_vars)
meta_step_cond = tf.squeeze(tf.logical_and(step_cond, tf.logical_or(next_reset_episode_cond, meta_end)))
collect_experience_op = tf.group(
replay_buffer.maybe_add(transition, step_cond),
meta_replay_buffer.maybe_add(meta_transition, meta_step_cond),
)
with tf.control_dependencies([collect_experience_op]):
collect_experience_op = tf.cond(reset_env_cond,
tf_env.reset,
tf_env.current_time_step)
meta_period = tf.equal(agent.tf_context.t % meta_action_every_n, 1)
states_var_upd = tf.scatter_update(
states_var, (agent.tf_context.t - 1) % meta_action_every_n,
next_state)
state_var_upd = tf.assign(
state_var,
tf.cond(meta_period, lambda: next_state, lambda: state_var))
reward_var_upd = tf.assign(
reward_var,
tf.cond(meta_period,
lambda: tf.zeros_like(reward_var),
lambda: reward_var + reward))
meta_action = tf.to_float(tf.concat(agent.context_vars, -1))
meta_action_var_upd = tf.assign(
meta_action_var,
tf.cond(meta_period, lambda: meta_action, lambda: meta_action_var))
meta_context_var_upd = [
tf.assign(
meta_context_var[idx],
tf.cond(meta_period,
lambda: meta_agent.context_vars[idx],
lambda: meta_context_var[idx]))
for idx in range(len(meta_context))]
return tf.group(
collect_experience_op,
states_var_upd,
state_var_upd,
reward_var_upd,
meta_action_var_upd,
*meta_context_var_upd)
def sample_best_meta_actions(state_reprs, next_state_reprs, prev_meta_actions,
low_states, low_actions, low_state_reprs,
inverse_dynamics, uvf_agent, k=10):
"""Return meta-actions which approximately maximize low-level log-probs."""
sampled_actions = inverse_dynamics.sample(state_reprs, next_state_reprs, k, prev_meta_actions)
sampled_actions = tf.stop_gradient(sampled_actions)
sampled_log_probs = tf.reshape(uvf_agent.log_probs(
tf.tile(low_states, [k, 1, 1]),
tf.tile(low_actions, [k, 1, 1]),
tf.tile(low_state_reprs, [k, 1, 1]),
[tf.reshape(sampled_actions, [-1, sampled_actions.shape[-1]])]),
[k, low_states.shape[0],
low_states.shape[1], -1])
fitness = tf.reduce_sum(sampled_log_probs, [2, 3])
best_actions = tf.argmax(fitness, 0)
actions = tf.gather_nd(
sampled_actions,
tf.stack([best_actions,
tf.range(prev_meta_actions.shape[0], dtype=tf.int64)], -1))
return actions
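# Editor's note: a small numpy analogue (not part of this commit) of the selection
# logic above: sample k candidate meta-actions per item, score each candidate with
# a fitness (here a toy negative-distance scorer standing in for the summed
# low-level log-probs), and keep the best candidate per item.
import numpy as np

def pick_best(candidates, fitness_fn):
    # candidates: [k, batch, dim]; returns the best candidate per batch item, [batch, dim].
    fitness = fitness_fn(candidates)                 # [k, batch]
    best = np.argmax(fitness, axis=0)                # [batch]
    return candidates[best, np.arange(candidates.shape[1])]

rng = np.random.RandomState(0)
cands = rng.randn(10, 4, 2)                          # k=10 candidates, batch=4, dim=2
best = pick_best(cands, lambda c: -np.linalg.norm(c, axis=-1))
assert best.shape == (4, 2)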
@gin.configurable
def train_uvf(train_dir,
environment=None,
num_bin_actions=3,
agent_class=None,
meta_agent_class=None,
state_preprocess_class=None,
inverse_dynamics_class=None,
exp_action_wrapper=None,
replay_buffer=None,
meta_replay_buffer=None,
replay_num_steps=1,
meta_replay_num_steps=1,
critic_optimizer=None,
actor_optimizer=None,
meta_critic_optimizer=None,
meta_actor_optimizer=None,
repr_optimizer=None,
relabel_contexts=False,
meta_relabel_contexts=False,
batch_size=64,
repeat_size=0,
num_episodes_train=2000,
initial_episodes=2,
initial_steps=None,
num_updates_per_observation=1,
num_collect_per_update=1,
num_collect_per_meta_update=1,
gamma=1.0,
meta_gamma=1.0,
reward_scale_factor=1.0,
target_update_period=1,
should_stop_early=None,
clip_gradient_norm=0.0,
summarize_gradients=False,
debug_summaries=False,
log_every_n_steps=100,
prefetch_queue_capacity=2,
policy_save_dir='policy',
save_policy_every_n_steps=1000,
save_policy_interval_secs=0,
replay_context_ratio=0.0,
next_state_as_context_ratio=0.0,
state_index=0,
zero_timer_ratio=0.0,
timer_index=-1,
debug=False,
max_policies_to_save=None,
max_steps_per_episode=None,
load_path=LOAD_PATH):
"""Train an agent."""
tf_env = create_maze_env.TFPyEnvironment(environment)
observation_spec = [tf_env.observation_spec()]
action_spec = [tf_env.action_spec()]
max_steps_per_episode = max_steps_per_episode or tf_env.pyenv.max_episode_steps
assert max_steps_per_episode, 'max_steps_per_episode must be set'
if initial_steps is None:
initial_steps = initial_episodes * max_steps_per_episode
if agent_class.ACTION_TYPE == 'discrete':
assert False, 'Discrete action spaces are not supported.'
else:
assert agent_class.ACTION_TYPE == 'continuous'
assert agent_class.ACTION_TYPE == meta_agent_class.ACTION_TYPE
with tf.variable_scope('meta_agent'):
meta_agent = meta_agent_class(
observation_spec,
action_spec,
tf_env,
debug_summaries=debug_summaries)
meta_agent.set_replay(replay=meta_replay_buffer)
with tf.variable_scope('uvf_agent'):
uvf_agent = agent_class(
observation_spec,
action_spec,
tf_env,
debug_summaries=debug_summaries)
uvf_agent.set_meta_agent(agent=meta_agent)
uvf_agent.set_replay(replay=replay_buffer)
with tf.variable_scope('state_preprocess'):
state_preprocess = state_preprocess_class()
with tf.variable_scope('inverse_dynamics'):
inverse_dynamics = inverse_dynamics_class(
meta_agent.sub_context_as_action_specs[0])
# Create counter variables
global_step = tf.contrib.framework.get_or_create_global_step()
num_episodes = tf.Variable(0, dtype=tf.int64, name='num_episodes')
num_resets = tf.Variable(0, dtype=tf.int64, name='num_resets')
num_updates = tf.Variable(0, dtype=tf.int64, name='num_updates')
num_meta_updates = tf.Variable(0, dtype=tf.int64, name='num_meta_updates')
episode_rewards = tf.Variable([0.] * 100, name='episode_rewards')
episode_meta_rewards = tf.Variable([0.] * 100, name='episode_meta_rewards')
# Create counter variables summaries
train_utils.create_counter_summaries([
('environment_steps', global_step),
('num_episodes', num_episodes),
('num_resets', num_resets),
('num_updates', num_updates),
('num_meta_updates', num_meta_updates),
('replay_buffer_adds', replay_buffer.get_num_adds()),
('meta_replay_buffer_adds', meta_replay_buffer.get_num_adds()),
])
tf.summary.scalar('avg_episode_rewards',
tf.reduce_mean(episode_rewards[1:]))
tf.summary.scalar('avg_episode_meta_rewards',
tf.reduce_mean(episode_meta_rewards[1:]))
tf.summary.histogram('episode_rewards', episode_rewards[1:])
tf.summary.histogram('episode_meta_rewards', episode_meta_rewards[1:])
# Create init ops
action_fn = uvf_agent.action
action_fn = uvf_agent.add_noise_fn(action_fn, global_step=None)
meta_action_fn = meta_agent.action
meta_action_fn = meta_agent.add_noise_fn(meta_action_fn, global_step=None)
meta_actions_fn = meta_agent.actions
meta_actions_fn = meta_agent.add_noise_fn(meta_actions_fn, global_step=None)
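# Wrap the raw policies with exploration noise for both the low-level and
# meta-level action functions.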
init_collect_experience_op = collect_experience(
tf_env,
uvf_agent,
meta_agent,
state_preprocess,
replay_buffer,
meta_replay_buffer,
action_fn,
meta_action_fn,
environment_steps=global_step,
num_episodes=num_episodes,
num_resets=num_resets,
episode_rewards=episode_rewards,
episode_meta_rewards=episode_meta_rewards,
store_context=True,
disable_agent_reset=False,
)
# Create train ops
collect_experience_op = collect_experience(
tf_env,
uvf_agent,
meta_agent,
state_preprocess,
replay_buffer,
meta_replay_buffer,
action_fn,
meta_action_fn,
environment_steps=global_step,
num_episodes=num_episodes,
num_resets=num_resets,
episode_rewards=episode_rewards,
episode_meta_rewards=episode_meta_rewards,
store_context=True,
disable_agent_reset=False,
)
train_op_list = []
repr_train_op = tf.constant(0.0)
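# Build critic and actor train ops for both the meta-agent ('meta') and the
# low-level UVF agent ('nometa').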
for mode in ['meta', 'nometa']:
if mode == 'meta':
agent = meta_agent
buff = meta_replay_buffer
critic_opt = meta_critic_optimizer
actor_opt = meta_actor_optimizer
relabel = meta_relabel_contexts
num_steps = meta_replay_num_steps
my_gamma = meta_gamma
n_updates = num_meta_updates
else:
agent = uvf_agent
buff = replay_buffer
critic_opt = critic_optimizer
actor_opt = actor_optimizer
relabel = relabel_contexts
num_steps = replay_num_steps
my_gamma = gamma
n_updates = num_updates
with tf.name_scope(mode):
batch = buff.get_random_batch(batch_size, num_steps=num_steps)
states, actions, rewards, discounts, next_states = batch[:5]
with tf.name_scope('Reward'):
tf.summary.scalar('average_step_reward', tf.reduce_mean(rewards))
rewards *= reward_scale_factor
batch_queue = slim.prefetch_queue.prefetch_queue(
[states, actions, rewards, discounts, next_states] + batch[5:],
capacity=prefetch_queue_capacity,
name='batch_queue')
batch_dequeue = batch_queue.dequeue()
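# Optionally tile the dequeued batch along the batch dimension to form a
# larger effective training batch.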
if repeat_size > 0:
batch_dequeue = [
tf.tile(batch, (repeat_size+1,) + (1,) * (batch.shape.ndims - 1))
for batch in batch_dequeue
]
batch_size *= (repeat_size + 1)
states, actions, rewards, discounts, next_states = batch_dequeue[:5]
if mode == 'meta':
low_states = batch_dequeue[5]
low_actions = batch_dequeue[6]
low_state_reprs = state_preprocess(low_states)
state_reprs = state_preprocess(states)
next_state_reprs = state_preprocess(next_states)
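# Meta-action (goal) re-labeling: keep the stored goal ('None'), sample a new
# goal from the inverse dynamics model ('FuN'), or search for the candidate
# goal that best explains the observed low-level behavior ('sample').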
if mode == 'meta': # Re-label meta-action
prev_actions = actions
if FLAGS.goal_sample_strategy == 'None':
pass
elif FLAGS.goal_sample_strategy == 'FuN':
actions = inverse_dynamics.sample(state_reprs, next_state_reprs, 1, prev_actions, sc=0.1)
actions = tf.stop_gradient(actions)
elif FLAGS.goal_sample_strategy == 'sample':
actions = sample_best_meta_actions(state_reprs, next_state_reprs, prev_actions,
low_states, low_actions, low_state_reprs,
inverse_dynamics, uvf_agent, k=10)
else:
assert False, 'Unknown goal_sample_strategy: %s' % FLAGS.goal_sample_strategy
if state_preprocess.trainable and mode == 'meta':
# Representation learning is based on meta-transitions, but is trained
# along with low-level policy updates.
repr_loss, _, _ = state_preprocess.loss(states, next_states, low_actions, low_states)
repr_train_op = slim.learning.create_train_op(
repr_loss,
repr_optimizer,
global_step=None,
update_ops=None,
summarize_gradients=summarize_gradients,
clip_gradient_norm=clip_gradient_norm,
variables_to_train=state_preprocess.get_trainable_vars(),)
# Get contexts for training
contexts, next_contexts = agent.sample_contexts(
mode='train', batch_size=batch_size,
state=states, next_state=next_states,
)
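# agent.sample_contexts provides re-labeled contexts; when re-labeling is
# disabled, fall back to the contexts stored with the transition.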
if not relabel: # Re-label context (in the style of TDM or HER).
contexts, next_contexts = (
batch_dequeue[-2*len(contexts):-1*len(contexts)],
batch_dequeue[-1*len(contexts):])
merged_states = agent.merged_states(states, contexts)
merged_next_states = agent.merged_states(next_states, next_contexts)
if mode == 'nometa':
context_rewards, context_discounts = agent.compute_rewards(
'train', state_reprs, actions, rewards, next_state_reprs, contexts)
elif mode == 'meta': # Meta-agent uses sum of rewards, not context-specific rewards.
_, context_discounts = agent.compute_rewards(
'train', states, actions, rewards, next_states, contexts)
context_rewards = rewards
if agent.gamma_index is not None:
context_discounts *= tf.cast(
tf.reshape(contexts[agent.gamma_index], (-1,)),
dtype=context_discounts.dtype)
else: context_discounts *= my_gamma
critic_loss = agent.critic_loss(merged_states, actions,
context_rewards, context_discounts,
merged_next_states)
critic_loss = tf.reduce_mean(critic_loss)
actor_loss = agent.actor_loss(merged_states, actions,
context_rewards, context_discounts,
merged_next_states)
actor_loss *= tf.to_float( # Only update actor every N steps.
tf.equal(n_updates % target_update_period, 0))
critic_train_op = slim.learning.create_train_op(
critic_loss,
critic_opt,
global_step=n_updates,
update_ops=None,
summarize_gradients=summarize_gradients,
clip_gradient_norm=clip_gradient_norm,
variables_to_train=agent.get_trainable_critic_vars(),)
critic_train_op = uvf_utils.tf_print(
critic_train_op, [critic_train_op],
message='critic_loss',
print_freq=1000,
name='critic_loss')
train_op_list.append(critic_train_op)
if actor_loss is not None:
actor_train_op = slim.learning.create_train_op(
actor_loss,
actor_opt,
global_step=None,
update_ops=None,
summarize_gradients=summarize_gradients,
clip_gradient_norm=clip_gradient_norm,
variables_to_train=agent.get_trainable_actor_vars(),)
actor_train_op = uvf_utils.tf_print(
actor_train_op, [actor_train_op],
message='actor_loss',
print_freq=1000,
name='actor_loss')
train_op_list.append(actor_train_op)
assert len(train_op_list) == 4
# Update targets should happen after the networks have been updated.
with tf.control_dependencies(train_op_list[2:]):
update_targets_op = uvf_utils.periodically(
uvf_agent.update_targets, target_update_period, 'update_targets')
if meta_agent is not None:
with tf.control_dependencies(train_op_list[:2]):
update_meta_targets_op = uvf_utils.periodically(
meta_agent.update_targets, target_update_period, 'update_targets')
assert_op = tf.Assert( # Hack to get training to stop.
tf.less_equal(global_step, 200 + num_episodes_train * max_steps_per_episode),
[global_step])
with tf.control_dependencies([update_targets_op, assert_op]):
train_op = tf.add_n(train_op_list[2:], name='post_update_targets')
# Representation training steps on every low-level policy training step.
train_op += repr_train_op
with tf.control_dependencies([update_meta_targets_op, assert_op]):
meta_train_op = tf.add_n(train_op_list[:2],
name='post_update_meta_targets')
if debug_summaries:
train_utils.gen_debug_batch_summaries(batch)
slim.summaries.add_histogram_summaries(
uvf_agent.get_trainable_critic_vars(), 'critic_vars')
slim.summaries.add_histogram_summaries(
uvf_agent.get_trainable_actor_vars(), 'actor_vars')
train_ops = train_utils.TrainOps(train_op, meta_train_op,
collect_experience_op)
policy_save_path = os.path.join(train_dir, policy_save_dir, 'model.ckpt')
policy_vars = uvf_agent.get_actor_vars() + meta_agent.get_actor_vars() + [
global_step, num_episodes, num_resets
] + list(uvf_agent.context_vars) + list(meta_agent.context_vars) + state_preprocess.get_trainable_vars()
# add critic vars, since some test evaluation depends on them
policy_vars += uvf_agent.get_trainable_critic_vars() + meta_agent.get_trainable_critic_vars()
policy_saver = tf.train.Saver(
policy_vars, max_to_keep=max_policies_to_save, sharded=False)
lowlevel_vars = (uvf_agent.get_actor_vars() +
uvf_agent.get_trainable_critic_vars() +
state_preprocess.get_trainable_vars())
lowlevel_saver = tf.train.Saver(lowlevel_vars)
def policy_save_fn(sess):
policy_saver.save(
sess, policy_save_path, global_step=global_step, write_meta_graph=False)
if save_policy_interval_secs > 0:
tf.logging.info(
'Waiting %d secs after saving policy.' % save_policy_interval_secs)
time.sleep(save_policy_interval_secs)
train_step_fn = train_utils.TrainStep(
max_number_of_steps=num_episodes_train * max_steps_per_episode + 100,
num_updates_per_observation=num_updates_per_observation,
num_collect_per_update=num_collect_per_update,
num_collect_per_meta_update=num_collect_per_meta_update,
log_every_n_steps=log_every_n_steps,
policy_save_fn=policy_save_fn,
save_policy_every_n_steps=save_policy_every_n_steps,
should_stop_early=should_stop_early).train_step
local_init_op = tf.local_variables_initializer()
init_targets_op = tf.group(uvf_agent.update_targets(1.0),
meta_agent.update_targets(1.0))
def initialize_training_fn(sess):
"""Initialize training function."""
sess.run(local_init_op)
sess.run(init_targets_op)
if load_path:
tf.logging.info('Restoring low-level from %s' % load_path)
lowlevel_saver.restore(sess, load_path)
global_step_value = sess.run(global_step)
assert global_step_value == 0, 'Global step should be zero.'
collect_experience_call = sess.make_callable(
init_collect_experience_op)
for _ in range(initial_steps):
collect_experience_call()
train_saver = tf.train.Saver(max_to_keep=2, sharded=True)
tf.logging.info('train dir: %s', train_dir)
return slim.learning.train(
train_ops,
train_dir,
train_step_fn=train_step_fn,
save_interval_secs=FLAGS.save_interval_secs,
saver=train_saver,
log_every_n_steps=0,
global_step=global_step,
master="",
is_chief=(FLAGS.task == 0),
save_summaries_secs=FLAGS.save_summaries_secs,
init_fn=initialize_training_fn)
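# Example gin bindings for train_uvf (hypothetical values, for illustration
# only; they are not the repository's actual configuration):
#   train_uvf.num_episodes_train = 2000
#   train_uvf.batch_size = 64
#   train_uvf.gamma = 0.99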
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r""""""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import os
import time
import tensorflow as tf
import gin.tf
flags = tf.app.flags
flags.DEFINE_multi_string('config_file', None,
'List of paths to the config files.')
flags.DEFINE_multi_string('params', None,
'Newline separated list of Gin parameter bindings.')
flags.DEFINE_string('train_dir', None,
'Directory for writing logs/summaries during training.')
flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use.')
flags.DEFINE_integer('task', 0, 'task id')
flags.DEFINE_integer('save_interval_secs', 300, 'The frequency at which '
'checkpoints are saved, in seconds.')
flags.DEFINE_integer('save_summaries_secs', 30, 'The frequency at which '
'summaries are saved, in seconds.')
flags.DEFINE_boolean('summarize_gradients', False,
'Whether to generate gradient summaries.')
FLAGS = flags.FLAGS
TrainOps = namedtuple('TrainOps',
['train_op', 'meta_train_op', 'collect_experience_op'])
class TrainStep(object):
"""Handles training step."""
def __init__(self,
max_number_of_steps=0,
num_updates_per_observation=1,
num_collect_per_update=1,
num_collect_per_meta_update=1,
log_every_n_steps=1,
policy_save_fn=None,
save_policy_every_n_steps=0,
should_stop_early=None):
"""Returns a function that is executed at each step of slim training.
Args:
max_number_of_steps: Optional maximum number of train steps to take.
num_updates_per_observation: Number of updates per observation.
log_every_n_steps: The frequency, in terms of global steps, that the loss
and global step and logged.
policy_save_fn: A tf.Saver().save function to save the policy.
save_policy_every_n_steps: How frequently to save the policy.
should_stop_early: Optional hook to report whether training should stop.
Raises:
ValueError: If policy_save_fn is not provided when
save_policy_every_n_steps > 0.
"""
if save_policy_every_n_steps and policy_save_fn is None:
raise ValueError(
'policy_save_fn is required when save_policy_every_n_steps > 0')
self.max_number_of_steps = max_number_of_steps
self.num_updates_per_observation = num_updates_per_observation
self.num_collect_per_update = num_collect_per_update
self.num_collect_per_meta_update = num_collect_per_meta_update
self.log_every_n_steps = log_every_n_steps
self.policy_save_fn = policy_save_fn
self.save_policy_every_n_steps = save_policy_every_n_steps
self.should_stop_early = should_stop_early
self.last_global_step_val = 0
self.train_op_fn = None
self.collect_and_train_fn = None
tf.logging.info('Training for %d max_number_of_steps',
self.max_number_of_steps)
def train_step(self, sess, train_ops, global_step, _):
"""This function will be called at each step of training.
This represents one step of the DDPG algorithm and can include:
1. collect a <state, action, reward, next_state> transition
2. update the target network
3. train the actor
4. train the critic
Args:
sess: A Tensorflow session.
train_ops: A TrainOps namedtuple of train ops to run.
global_step: The global step.
Returns:
A scalar total loss.
A boolean should stop.
"""
start_time = time.time()
if self.train_op_fn is None:
self.train_op_fn = sess.make_callable([train_ops.train_op, global_step])
self.meta_train_op_fn = sess.make_callable([train_ops.meta_train_op, global_step])
self.collect_fn = sess.make_callable([train_ops.collect_experience_op, global_step])
self.collect_and_train_fn = sess.make_callable(
[train_ops.train_op, global_step, train_ops.collect_experience_op])
self.collect_and_meta_train_fn = sess.make_callable(
[train_ops.meta_train_op, global_step, train_ops.collect_experience_op])
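# Honor the configured collect/update ratios: run any extra environment
# collection steps and extra training updates before the combined
# collect-and-train call below.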
for _ in range(self.num_collect_per_update - 1):
self.collect_fn()
for _ in range(self.num_updates_per_observation - 1):
self.train_op_fn()
total_loss, global_step_val, _ = self.collect_and_train_fn()
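# Run a meta-agent update each time the global step crosses a multiple of
# num_collect_per_meta_update.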
if (global_step_val // self.num_collect_per_meta_update !=
self.last_global_step_val // self.num_collect_per_meta_update):
self.meta_train_op_fn()
time_elapsed = time.time() - start_time
should_stop = False
if self.max_number_of_steps:
should_stop = global_step_val >= self.max_number_of_steps
if global_step_val != self.last_global_step_val:
if (self.save_policy_every_n_steps and
global_step_val // self.save_policy_every_n_steps !=
self.last_global_step_val // self.save_policy_every_n_steps):
self.policy_save_fn(sess)
if (self.log_every_n_steps and
global_step_val % self.log_every_n_steps == 0):
tf.logging.info(
'global step %d: loss = %.4f (%.3f sec/step) (%d steps/sec)',
global_step_val, total_loss, time_elapsed, 1 / time_elapsed)
self.last_global_step_val = global_step_val
stop_early = bool(self.should_stop_early and self.should_stop_early())
return total_loss, should_stop or stop_early
def create_counter_summaries(counters):
"""Add named summaries to counters, a list of tuples (name, counter)."""
if counters:
with tf.name_scope('Counters/'):
for name, counter in counters:
tf.summary.scalar(name, counter)
def gen_debug_batch_summaries(batch):
"""Generates summaries for the sampled replay batch."""
states, actions, rewards, _, next_states = batch
with tf.name_scope('batch'):
for s in range(states.get_shape()[-1]):
tf.summary.histogram('states_%d' % s, states[:, s])
for s in range(states.get_shape()[-1]):
tf.summary.histogram('next_states_%d' % s, next_states[:, s])
for a in range(actions.get_shape()[-1]):
tf.summary.histogram('actions_%d' % a, actions[:, a])
tf.summary.histogram('rewards', rewards)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Evaluation utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf
import gin.tf
logging = tf.logging
@gin.configurable
def evaluate_checkpoint_repeatedly(checkpoint_dir,
evaluate_checkpoint_fn,
eval_interval_secs=600,
max_number_of_evaluations=None,
checkpoint_timeout=None,
timeout_fn=None):
"""Evaluates a checkpointed model at a set interval."""
if max_number_of_evaluations is not None and max_number_of_evaluations <= 0:
raise ValueError(
'`max_number_of_evaluations` must be either None or a positive number.')
number_of_evaluations = 0
for checkpoint_path in tf.contrib.training.checkpoints_iterator(
checkpoint_dir,
min_interval_secs=eval_interval_secs,
timeout=checkpoint_timeout,
timeout_fn=timeout_fn):
retries = 3
should_stop = False
for _ in range(retries):
try:
should_stop = evaluate_checkpoint_fn(checkpoint_path)
break
except tf.errors.DataLossError:
logging.warn(
'Encountered a DataLossError while evaluating a checkpoint. This '
'can happen when reading a checkpoint before it is fully written. '
'Retrying...')
time.sleep(2.0)
number_of_evaluations += 1
if should_stop or (max_number_of_evaluations is not None and
number_of_evaluations >= max_number_of_evaluations):
return
def compute_model_loss(sess, model_rollout_fn, states, actions):
"""Computes model loss."""
preds, losses = [], []
preds.append(states[0])
losses.append(0)
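# Roll the model out open loop: each prediction is fed back as the next input,
# and the loss is the Euclidean distance to the observed state.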
for state, action in zip(states[1:], actions[1:]):
pred = model_rollout_fn(sess, preds[-1], action)
loss = np.sqrt(np.sum((state - pred) ** 2))
preds.append(pred)
losses.append(loss)
return preds, losses
def compute_average_reward(sess, env_base, step_fn, gamma, num_steps,
num_episodes):
"""Computes the discounted reward for a given number of steps.
Args:
sess: The tensorflow session.
env_base: A python environment.
step_fn: A function that takes in `sess` and returns a list of
[state, action, reward, discount, transition_type] values.
gamma: discounting factor to apply to the reward.
num_steps: number of steps to compute the reward over.
num_episodes: number of episodes to average the reward over.
Returns:
average_reward: a scalar of discounted reward.
last_reward: last reward received.
"""
average_reward = 0
average_last_reward = 0
average_meta_reward = 0
average_last_meta_reward = 0
average_success = 0.
states, actions = None, None
for i in range(num_episodes):
env_base.end_episode()
env_base.begin_episode()
(reward, last_reward, meta_reward, last_meta_reward,
states, actions) = compute_reward(
sess, step_fn, gamma, num_steps)
s_reward = last_meta_reward # Navigation
success = (s_reward > -5.0) # When using diff=False
logging.info('Episode = %d, reward = %s, meta_reward = %f, '
'last_reward = %s, last meta_reward = %f, success = %s',
i, reward, meta_reward, last_reward, last_meta_reward,
success)
average_reward += reward
average_last_reward += last_reward
average_meta_reward += meta_reward
average_last_meta_reward += last_meta_reward
average_success += success
average_reward /= num_episodes
average_last_reward /= num_episodes
average_meta_reward /= num_episodes
average_last_meta_reward /= num_episodes
average_success /= num_episodes
return (average_reward, average_last_reward,
average_meta_reward, average_last_meta_reward,
average_success,
states, actions)
def compute_reward(sess, step_fn, gamma, num_steps):
"""Computes the discounted reward for a given number of steps.
Args:
sess: The tensorflow session.
step_fn: A function that takes in `sess` and returns a list of
[state, action, reward, discount, transition_type] values.
gamma: discounting factor to apply to the reward.
num_steps: number of steps to compute the reward over.
Returns:
reward: cumulative discounted reward.
last_reward: reward received at final step.
"""
total_reward = 0
total_meta_reward = 0
gamma_step = 1
states = []
actions = []
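# Accumulate discounted environment and meta rewards over num_steps steps.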
for _ in range(num_steps):
state, action, transition_type, reward, meta_reward, discount, _, _ = step_fn(sess)
total_reward += reward * gamma_step * discount
total_meta_reward += meta_reward * gamma_step * discount
gamma_step *= gamma
states.append(state)
actions.append(action)
return (total_reward, reward, total_meta_reward, meta_reward,
states, actions)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from copy import deepcopy
import tensorflow as tf
from tf_agents import specs
from tf_agents.utils import common
_tf_print_counts = dict()
_tf_print_running_sums = dict()
_tf_print_running_counts = dict()
_tf_print_ids = 0
def get_contextual_env_base(env_base, begin_ops=None, end_ops=None):
"""Wrap env_base with additional tf ops."""
# pylint: disable=protected-access
def init(self_, env_base):
self_._env_base = env_base
attribute_list = ["_render_mode", "_gym_env"]
for attribute in attribute_list:
if hasattr(env_base, attribute):
setattr(self_, attribute, getattr(env_base, attribute))
if hasattr(env_base, "physics"):
self_._physics = env_base.physics
elif hasattr(env_base, "gym"):
class Physics(object):
def render(self, *args, **kwargs):
return env_base.gym.render("rgb_array")
physics = Physics()
self_._physics = physics
self_.physics = physics
def set_sess(self_, sess):
self_._sess = sess
if hasattr(self_._env_base, "set_sess"):
self_._env_base.set_sess(sess)
def begin_episode(self_):
self_._env_base.reset()
if begin_ops is not None:
self_._sess.run(begin_ops)
def end_episode(self_):
self_._env_base.reset()
if end_ops is not None:
self_._sess.run(end_ops)
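# Dynamically subclass env_base so the wrapped environment keeps its original
# interface while gaining session-aware begin/end episode hooks.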
return type("ContextualEnvBase", (env_base.__class__,), dict(
__init__=init,
set_sess=set_sess,
begin_episode=begin_episode,
end_episode=end_episode,
))(env_base)
# pylint: enable=protected-access
def merge_specs(specs_):
"""Merge TensorSpecs.
Args:
specs_: List of TensorSpecs to be merged.
Returns:
a TensorSpec: a merged TensorSpec.
"""
shape = specs_[0].shape
dtype = specs_[0].dtype
name = specs_[0].name
for spec in specs_[1:]:
assert shape[1:] == spec.shape[1:], "incompatible shapes: %s, %s" % (
shape, spec.shape)
assert dtype == spec.dtype, "incompatible dtypes: %s, %s" % (
dtype, spec.dtype)
shape = merge_shapes((shape, spec.shape), axis=0)
return specs.TensorSpec(
shape=shape,
dtype=dtype,
name=name,
)
def merge_shapes(shapes, axis=0):
"""Merge TensorShapes.
Args:
shapes: List of TensorShapes to be merged.
axis: optional, the axis along which to merge the shapes.
Returns:
a TensorShape: a merged TensorShape.
"""
assert len(shapes) > 1
dims = deepcopy(shapes[0].dims)
for shape in shapes[1:]:
assert shapes[0].ndims == shape.ndims
dims[axis] += shape.dims[axis]
return tf.TensorShape(dims=dims)
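# Illustrative example: merging TensorShapes [2, 3] and [4, 3] along axis 0
# yields TensorShape([6, 3]).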
def get_all_vars(ignore_scopes=None):
"""Get all tf variables in scope.
Args:
ignore_scopes: A list of scope names to ignore.
Returns:
A list of all tf variables in scope.
"""
all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
all_vars = [var for var in all_vars if ignore_scopes is None or not
any(var.name.startswith(scope) for scope in ignore_scopes)]
return all_vars
def clip(tensor, range_=None):
"""Return a tf op which clips tensor according to range_.
Args:
tensor: A Tensor to be clipped.
range_: None, or a tuple representing (minval, maxval)
Returns:
A clipped Tensor.
"""
if range_ is None:
return tf.identity(tensor)
elif isinstance(range_, (tuple, list)):
assert len(range_) == 2
return tf.clip_by_value(tensor, range_[0], range_[1])
else: raise NotImplementedError("Unacceptable range input: %r" % range_)
def clip_to_bounds(value, minimum, maximum):
"""Clips value to be between minimum and maximum.
Args:
value: (tensor) value to be clipped.
minimum: (numpy float array) minimum value to clip to.
maximum: (numpy float array) maximum value to clip to.
Returns:
clipped_value: (tensor) `value` clipped to between `minimum` and `maximum`.
"""
value = tf.minimum(value, maximum)
return tf.maximum(value, minimum)
clip_to_spec = common.clip_to_spec
def _clip_to_spec(value, spec):
"""Clips value to a given bounded tensor spec.
Args:
value: (tensor) value to be clipped.
spec: (BoundedTensorSpec) spec containing min. and max. values for clipping.
Returns:
clipped_value: (tensor) `value` clipped to be compatible with `spec`.
"""
return clip_to_bounds(value, spec.minimum, spec.maximum)
join_scope = common.join_scope
def _join_scope(parent_scope, child_scope):
"""Joins a parent and child scope using `/`, checking for empty/none.
Args:
parent_scope: (string) parent/prefix scope.
child_scope: (string) child/suffix scope.
Returns:
joined scope: (string) parent and child scopes joined by /.
"""
if not parent_scope:
return child_scope
if not child_scope:
return parent_scope
return '/'.join([parent_scope, child_scope])
def assign_vars(vars_, values):
"""Returns the update ops for assigning a list of vars.
Args:
vars_: A list of variables.
values: A list of tensors representing new values.
Returns:
A list of update ops for the variables.
"""
return [var.assign(value) for var, value in zip(vars_, values)]
def identity_vars(vars_):
"""Return the identity ops for a list of tensors.
Args:
vars_: A list of tensors.
Returns:
A list of identity ops.
"""
return [tf.identity(var) for var in vars_]
def tile(var, batch_size=1):
"""Return tiled tensor.
Args:
var: A tensor representing the state.
batch_size: Batch size.
Returns:
A tensor with shape [batch_size,] + var.shape.
"""
batch_var = tf.tile(
tf.expand_dims(var, 0),
(batch_size,) + (1,) * var.get_shape().ndims)
return batch_var
def batch_list(vars_list):
"""Batch a list of variables.
Args:
vars_list: A list of tensor variables.
Returns:
A list of tensor variables with additional first dimension.
"""
return [tf.expand_dims(var, 0) for var in vars_list]
def tf_print(op,
tensors,
message="",
first_n=-1,
name=None,
sub_messages=None,
print_freq=-1,
include_count=True):
"""tf.Print, but to stdout."""
# TODO(shanegu): `name` is deprecated. Remove it from the rest of the code.
global _tf_print_ids
_tf_print_ids += 1
name = _tf_print_ids
_tf_print_counts[name] = 0
if print_freq > 0:
_tf_print_running_sums[name] = [0 for _ in tensors]
_tf_print_running_counts[name] = 0
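# When print_freq > 0, accumulate running sums and log the running average of
# each tensor once every print_freq calls instead of logging on every call.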
def print_message(*xs):
"""print message fn."""
_tf_print_counts[name] += 1
if print_freq > 0:
for i, x in enumerate(xs):
_tf_print_running_sums[name][i] += x
_tf_print_running_counts[name] += 1
if (print_freq <= 0 or _tf_print_running_counts[name] >= print_freq) and (
first_n < 0 or _tf_print_counts[name] <= first_n):
for i, x in enumerate(xs):
if print_freq > 0:
del x
x = _tf_print_running_sums[name][i]/_tf_print_running_counts[name]
if sub_messages is None:
sub_message = str(i)
else:
sub_message = sub_messages[i]
log_message = "%s, %s" % (message, sub_message)
if include_count:
log_message += ", count=%d" % _tf_print_counts[name]
tf.logging.info("[%s]: %s" % (log_message, x))
if print_freq > 0:
for i, x in enumerate(xs):
_tf_print_running_sums[name][i] = 0
_tf_print_running_counts[name] = 0
return xs[0]
print_op = tf.py_func(print_message, tensors, tensors[0].dtype)
with tf.control_dependencies([print_op]):
op = tf.identity(op)
return op
periodically = common.periodically
def _periodically(body, period, name='periodically'):
"""Periodically performs a tensorflow op."""
if period is None or period == 0:
return tf.no_op()
if period < 0:
raise ValueError("period cannot be less than 0.")
if period == 1:
return body()
with tf.variable_scope(None, default_name=name):
counter = tf.get_variable(
"counter",
shape=[],
dtype=tf.int64,
trainable=False,
initializer=tf.constant_initializer(period, dtype=tf.int64))
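# The counter starts at `period`, so the body runs on the first call and then
# once every `period` subsequent calls.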
def _wrapped_body():
with tf.control_dependencies([body()]):
return counter.assign(1)
update = tf.cond(
tf.equal(counter, period), _wrapped_body,
lambda: counter.assign_add(1))
return update
soft_variables_update = common.soft_variables_update
Tensorflow mobile video object detection implementation porposed in the following paper:
Tensorflow mobile video object detection implementation proposed in the following paper:
Mobile Video Object Detection with Temporally-Aware Feature Maps (CVPR 2018).
http://openaccess.thecvf.com/content_cvpr_2018/papers/Liu_Mobile_Video_Object_CVPR_2018_paper.pdf
......
......@@ -31,7 +31,7 @@ from google3.pyglib import app
from google3.pyglib import flags
from lstm_object_detection import evaluator
from lstm_object_detection import model_builder
from lstm_object_detection import seq_dataset_builder
from lstm_object_detection.inputs import seq_dataset_builder
from lstm_object_detection.utils import config_util
from object_detection.utils import label_map_util
......
......@@ -25,7 +25,7 @@ that wraps the build function.
import tensorflow as tf
import tensorflow.google as google_tf
from tensorflow.contrib.training.python.training import sequence_queueing_state_saver as sqss
from lstm_object_detection import tf_sequence_example_decoder
from lstm_object_detection.inputs import tf_sequence_example_decoder
from lstm_object_detection.protos import input_reader_google_pb2
from object_detection.core import preprocessor
from object_detection.core import preprocessor_cache
......
......@@ -23,7 +23,7 @@ from google.protobuf import text_format
from google3.testing.pybase import parameterized
from tensorflow.core.example import example_pb2
from tensorflow.core.example import feature_pb2
from lstm_object_detection import seq_dataset_builder
from lstm_object_detection.inputs import seq_dataset_builder
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from object_detection.builders import preprocessor_builder
from object_detection.core import standard_fields as fields
......