Commit 27b4acd4 authored by Aman Gupta

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A module with utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def trajectory_to_deltas(trajectory, state):
"""Computes a sequence of deltas of a state to traverse a trajectory in 2D.
The initial state of the agent contains its pose -- location in 2D and
orientation. When the computed deltas are incrementally added to it, it
traverses the specified trajectory while keeping its orientation parallel to
the trajectory.
Args:
trajectory: a np.array of floats of shape n x 2. The n-th row contains the
n-th point.
state: a 3 element np.array of floats containing agent's location and
orientation in radians.
Returns:
A np.array of floats of size n x 3.
"""
state = np.reshape(state, [-1])
init_xy = state[0:2]
init_theta = state[2]
delta_xy = trajectory - np.concatenate(
[np.reshape(init_xy, [1, 2]), trajectory[:-1, :]], axis=0)
thetas = np.reshape(np.arctan2(delta_xy[:, 1], delta_xy[:, 0]), [-1, 1])
thetas = np.concatenate([np.reshape(init_theta, [1, 1]), thetas], axis=0)
delta_thetas = thetas[1:] - thetas[:-1]
deltas = np.concatenate([delta_xy, delta_thetas], axis=1)
return deltas
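A minimal usage sketch (the trajectory and initial state below are illustrative, not from the source): starting at the origin facing along +x, the deltas for a two-point trajectory are one unit along x with no turn, then one unit along y with a +pi/2 change in heading.
if __name__ == '__main__':
  example_trajectory = np.array([[1.0, 0.0], [1.0, 1.0]])
  example_state = np.array([0.0, 0.0, 0.0])  # x, y, theta
  # Expected output: [[1., 0., 0.], [0., 1., pi/2]].
  print(trajectory_to_deltas(example_trajectory, example_state))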
item {
name: "/m/01g317"
id: 1
display_name: "person"
}
item {
name: "/m/0199g"
id: 2
display_name: "bicycle"
}
item {
name: "/m/0k4j"
id: 3
display_name: "car"
}
item {
name: "/m/04_sv"
id: 4
display_name: "motorcycle"
}
item {
name: "/m/05czz6l"
id: 5
display_name: "airplane"
}
item {
name: "/m/01bjv"
id: 6
display_name: "bus"
}
item {
name: "/m/07jdr"
id: 7
display_name: "train"
}
item {
name: "/m/07r04"
id: 8
display_name: "truck"
}
item {
name: "/m/019jd"
id: 9
display_name: "boat"
}
item {
name: "/m/015qff"
id: 10
display_name: "traffic light"
}
item {
name: "/m/01pns0"
id: 11
display_name: "fire hydrant"
}
item {
name: "/m/02pv19"
id: 13
display_name: "stop sign"
}
item {
name: "/m/015qbp"
id: 14
display_name: "parking meter"
}
item {
name: "/m/0cvnqh"
id: 15
display_name: "bench"
}
item {
name: "/m/015p6"
id: 16
display_name: "bird"
}
item {
name: "/m/01yrx"
id: 17
display_name: "cat"
}
item {
name: "/m/0bt9lr"
id: 18
display_name: "dog"
}
item {
name: "/m/03k3r"
id: 19
display_name: "horse"
}
item {
name: "/m/07bgp"
id: 20
display_name: "sheep"
}
item {
name: "/m/01xq0k1"
id: 21
display_name: "cow"
}
item {
name: "/m/0bwd_0j"
id: 22
display_name: "elephant"
}
item {
name: "/m/01dws"
id: 23
display_name: "bear"
}
item {
name: "/m/0898b"
id: 24
display_name: "zebra"
}
item {
name: "/m/03bk1"
id: 25
display_name: "giraffe"
}
item {
name: "/m/01940j"
id: 27
display_name: "backpack"
}
item {
name: "/m/0hnnb"
id: 28
display_name: "umbrella"
}
item {
name: "/m/080hkjn"
id: 31
display_name: "handbag"
}
item {
name: "/m/01rkbr"
id: 32
display_name: "tie"
}
item {
name: "/m/01s55n"
id: 33
display_name: "suitcase"
}
item {
name: "/m/02wmf"
id: 34
display_name: "frisbee"
}
item {
name: "/m/071p9"
id: 35
display_name: "skis"
}
item {
name: "/m/06__v"
id: 36
display_name: "snowboard"
}
item {
name: "/m/018xm"
id: 37
display_name: "sports ball"
}
item {
name: "/m/02zt3"
id: 38
display_name: "kite"
}
item {
name: "/m/03g8mr"
id: 39
display_name: "baseball bat"
}
item {
name: "/m/03grzl"
id: 40
display_name: "baseball glove"
}
item {
name: "/m/06_fw"
id: 41
display_name: "skateboard"
}
item {
name: "/m/019w40"
id: 42
display_name: "surfboard"
}
item {
name: "/m/0dv9c"
id: 43
display_name: "tennis racket"
}
item {
name: "/m/04dr76w"
id: 44
display_name: "bottle"
}
item {
name: "/m/09tvcd"
id: 46
display_name: "wine glass"
}
item {
name: "/m/08gqpm"
id: 47
display_name: "cup"
}
item {
name: "/m/0dt3t"
id: 48
display_name: "fork"
}
item {
name: "/m/04ctx"
id: 49
display_name: "knife"
}
item {
name: "/m/0cmx8"
id: 50
display_name: "spoon"
}
item {
name: "/m/04kkgm"
id: 51
display_name: "bowl"
}
item {
name: "/m/09qck"
id: 52
display_name: "banana"
}
item {
name: "/m/014j1m"
id: 53
display_name: "apple"
}
item {
name: "/m/0l515"
id: 54
display_name: "sandwich"
}
item {
name: "/m/0cyhj_"
id: 55
display_name: "orange"
}
item {
name: "/m/0hkxq"
id: 56
display_name: "broccoli"
}
item {
name: "/m/0fj52s"
id: 57
display_name: "carrot"
}
item {
name: "/m/01b9xk"
id: 58
display_name: "hot dog"
}
item {
name: "/m/0663v"
id: 59
display_name: "pizza"
}
item {
name: "/m/0jy4k"
id: 60
display_name: "donut"
}
item {
name: "/m/0fszt"
id: 61
display_name: "cake"
}
item {
name: "/m/01mzpv"
id: 62
display_name: "chair"
}
item {
name: "/m/02crq1"
id: 63
display_name: "couch"
}
item {
name: "/m/03fp41"
id: 64
display_name: "potted plant"
}
item {
name: "/m/03ssj5"
id: 65
display_name: "bed"
}
item {
name: "/m/04bcr3"
id: 67
display_name: "dining table"
}
item {
name: "/m/09g1w"
id: 70
display_name: "toilet"
}
item {
name: "/m/07c52"
id: 72
display_name: "tv"
}
item {
name: "/m/01c648"
id: 73
display_name: "laptop"
}
item {
name: "/m/020lf"
id: 74
display_name: "mouse"
}
item {
name: "/m/0qjjc"
id: 75
display_name: "remote"
}
item {
name: "/m/01m2v"
id: 76
display_name: "keyboard"
}
item {
name: "/m/050k8"
id: 77
display_name: "cell phone"
}
item {
name: "/m/0fx9l"
id: 78
display_name: "microwave"
}
item {
name: "/m/029bxz"
id: 79
display_name: "oven"
}
item {
name: "/m/01k6s3"
id: 80
display_name: "toaster"
}
item {
name: "/m/0130jx"
id: 81
display_name: "sink"
}
item {
name: "/m/040b_t"
id: 82
display_name: "refrigerator"
}
item {
name: "/m/0bt_c3"
id: 84
display_name: "book"
}
item {
name: "/m/01x3z"
id: 85
display_name: "clock"
}
item {
name: "/m/02s195"
id: 86
display_name: "vase"
}
item {
name: "/m/01lsmm"
id: 87
display_name: "scissors"
}
item {
name: "/m/0kmg4"
id: 88
display_name: "teddy bear"
}
item {
name: "/m/03wvsk"
id: 89
display_name: "hair drier"
}
item {
name: "/m/012xff"
id: 90
display_name: "toothbrush"
}
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Label map utility functions."""
import logging
import tensorflow as tf
from google.protobuf import text_format
import string_int_label_map_pb2
def _validate_label_map(label_map):
"""Checks if a label map is valid.
Args:
label_map: StringIntLabelMap to validate.
Raises:
ValueError: if label map is invalid.
"""
for item in label_map.item:
if item.id < 0:
raise ValueError('Label map ids should be >= 0.')
if (item.id == 0 and item.name != 'background' and
item.display_name != 'background'):
raise ValueError('Label map id 0 is reserved for the background label')
def create_category_index(categories):
"""Creates dictionary of COCO compatible categories keyed by category id.
Args:
categories: a list of dicts, each of which has the following keys:
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name
e.g., 'cat', 'dog', 'pizza'.
Returns:
category_index: a dict containing the same entries as categories, but keyed
by the 'id' field of each category.
"""
category_index = {}
for cat in categories:
category_index[cat['id']] = cat
return category_index
def get_max_label_map_index(label_map):
"""Get maximum index in label map.
Args:
label_map: a StringIntLabelMapProto
Returns:
an integer
"""
return max([item.id for item in label_map.item])
def convert_label_map_to_categories(label_map,
max_num_classes,
use_display_name=True):
"""Loads label map proto and returns categories list compatible with eval.
This function loads a label map and returns a list of dicts, each of which
has the following keys:
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name
e.g., 'cat', 'dog', 'pizza'.
We only allow a class into the list if its id-label_id_offset is
between 0 (inclusive) and max_num_classes (exclusive).
If there are several items mapping to the same id in the label map,
we will only keep the first one in the categories list.
Args:
label_map: a StringIntLabelMapProto or None. If None, a default categories
list is created with max_num_classes categories.
max_num_classes: maximum number of (consecutive) label indices to include.
use_display_name: (boolean) choose whether to load 'display_name' field
as category name. If False or if the display_name field does not exist,
uses 'name' field as category names instead.
Returns:
categories: a list of dictionaries representing all possible categories.
"""
categories = []
list_of_ids_already_added = []
if not label_map:
label_id_offset = 1
for class_id in range(max_num_classes):
categories.append({
'id': class_id + label_id_offset,
'name': 'category_{}'.format(class_id + label_id_offset)
})
return categories
for item in label_map.item:
if not 0 < item.id <= max_num_classes:
logging.info('Ignore item %d since it falls outside of requested '
'label range.', item.id)
continue
if use_display_name and item.HasField('display_name'):
name = item.display_name
else:
name = item.name
if item.id not in list_of_ids_already_added:
list_of_ids_already_added.append(item.id)
categories.append({'id': item.id, 'name': name})
return categories
def load_labelmap(path):
"""Loads label map proto.
Args:
path: path to StringIntLabelMap proto text file.
Returns:
a StringIntLabelMapProto
"""
with tf.gfile.GFile(path, 'r') as fid:
label_map_string = fid.read()
label_map = string_int_label_map_pb2.StringIntLabelMap()
try:
text_format.Merge(label_map_string, label_map)
except text_format.ParseError:
label_map.ParseFromString(label_map_string)
_validate_label_map(label_map)
return label_map
def get_label_map_dict(label_map_path, use_display_name=False):
"""Reads a label map and returns a dictionary of label names to id.
Args:
label_map_path: path to label_map.
use_display_name: whether to use the label map items' display names as keys.
Returns:
A dictionary mapping label names to id.
"""
label_map = load_labelmap(label_map_path)
label_map_dict = {}
for item in label_map.item:
if use_display_name:
label_map_dict[item.display_name] = item.id
else:
label_map_dict[item.name] = item.id
return label_map_dict
def create_category_index_from_labelmap(label_map_path):
"""Reads a label map and returns a category index.
Args:
label_map_path: Path to `StringIntLabelMap` proto text file.
Returns:
A category index, which is a dictionary that maps integer ids to dicts
containing categories, e.g.
{1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...}
"""
label_map = load_labelmap(label_map_path)
max_num_classes = max(item.id for item in label_map.item)
categories = convert_label_map_to_categories(label_map, max_num_classes)
return create_category_index(categories)
def create_class_agnostic_category_index():
"""Creates a category index with a single `object` class."""
return {1: {'id': 1, 'name': 'object'}}
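A minimal usage sketch, assuming the COCO-style label map shown earlier in this commit is saved at a hypothetical path 'mscoco_label_map.pbtxt': it builds a category index keyed by id and a display-name-to-id dictionary.
if __name__ == '__main__':
  label_map_path = 'mscoco_label_map.pbtxt'  # hypothetical path to the pbtxt above
  category_index = create_category_index_from_labelmap(label_map_path)
  print(category_index[18])  # {'id': 18, 'name': 'dog'}
  name_to_id = get_label_map_dict(label_map_path, use_display_name=True)
  print(name_to_id['dog'])  # 18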
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Interface for the policy of the agents use for navigation."""
import abc
import tensorflow as tf
from absl import logging
import embedders
from envs import task_env
slim = tf.contrib.slim
def _print_debug_ios(history, goal, output):
"""Prints sizes of history, goal and outputs."""
if history is not None:
shape = history.get_shape().as_list()
# logging.info('history embedding shape ')
# logging.info(shape)
if len(shape) != 3:
raise ValueError('history Tensor must have rank=3')
if goal is not None:
logging.info('goal embedding shape ')
logging.info(goal.get_shape().as_list())
if output is not None:
logging.info('targets shape ')
logging.info(output.get_shape().as_list())
class Policy(object):
"""Represents the policy of the agent for navigation tasks.
Instantiates a policy that takes embedders for each modality and builds a
model to infer the actions.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, embedders_dict, action_size):
"""Instantiates the policy.
Args:
embedders_dict: Dictionary of embedders for different modalities. Keys
should be identical to keys of observation modality.
action_size: Number of possible actions.
"""
self._embedders = embedders_dict
self._action_size = action_size
@abc.abstractmethod
def build(self, observations, prev_state):
"""Builds the model that represents the policy of the agent.
Args:
observations: Dictionary of observations from different modalities. Keys
are the names of the modalities.
prev_state: The tensor of the previous state of the model. Should be set
to None if the policy is stateless.
Returns:
Tuple of (action, state) where action is the action logits and state is
the state of the model after taking new observation.
"""
raise NotImplementedError(
'Needs implementation as part of Policy interface')
class LSTMPolicy(Policy):
"""Represents the implementation of the LSTM based policy.
The architecture of the model is as follows. It embeds all the observations
using the embedders and concatenates the embeddings of all the modalities.
The concatenated embedding is fed through two fully connected layers. The
LSTM takes the features from the fully connected layers, together with the
previous action and a bit indicating whether it succeeded, and the value for
each action is predicted afterwards.
Although the class name has the word LSTM in it, it also supports a mode that
builds the network without LSTM just for comparison purposes.
"""
def __init__(self,
modality_names,
embedders_dict,
action_size,
params,
max_episode_length,
feedforward_mode=False):
"""Instantiates the LSTM policy.
Args:
modality_names: List of modality names. Makes sure the ordering in
concatenation remains the same as modality_names list. Each modality
needs to be in the embedders_dict.
embedders_dict: Dictionary of embedders for different modalities. Keys
should be identical to keys of observation modality. Values should be
instance of Embedder class. All the observations except PREV_ACTION
requires embedder.
action_size: Number of possible actions.
params: an instance of tf.HParams containing the hyperparameters for the
policy network.
max_episode_length: integer, specifying the maximum length of each
episode.
feedforward_mode: If True, it does not add LSTM to the model. It should
only be set True for comparison between LSTM and feedforward models.
"""
super(LSTMPolicy, self).__init__(embedders_dict, action_size)
self._modality_names = modality_names
self._lstm_state_size = params.lstm_state_size
self._fc_channels = params.fc_channels
self._weight_decay = params.weight_decay
self._target_embedding_size = params.target_embedding_size
self._max_episode_length = max_episode_length
self._feedforward_mode = feedforward_mode
def _build_lstm(self, encoded_inputs, prev_state, episode_length,
prev_action=None):
"""Builds an LSTM on top of the encoded inputs.
If prev_action is not None then it concatenates them to the input of LSTM.
Args:
encoded_inputs: The embedding of the observations and goal.
prev_state: previous state of LSTM.
episode_length: The tensor that contains the length of the sequence for
each element of the batch.
prev_action: tensor of the previously chosen action and an additional bit
indicating whether the previous action was successful or not.
Returns:
a tuple of (lstm output, lstm state).
"""
# Adding prev action and success in addition to the embeddings of the
# modalities.
if prev_action is not None:
encoded_inputs = tf.concat([encoded_inputs, prev_action], axis=-1)
with tf.variable_scope('LSTM'):
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_state_size)
if prev_state is None:
# If prev state is set to None, a state of all zeros will be
# passed as a previous value for the cell. Should be used for the
# first step of each episode.
tf_prev_state = lstm_cell.zero_state(
encoded_inputs.get_shape().as_list()[0], dtype=tf.float32)
else:
tf_prev_state = tf.nn.rnn_cell.LSTMStateTuple(prev_state[0],
prev_state[1])
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
cell=lstm_cell,
inputs=encoded_inputs,
sequence_length=episode_length,
initial_state=tf_prev_state,
dtype=tf.float32,
)
lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])
return lstm_outputs, lstm_state
def build(
self,
observations,
prev_state,
):
"""Builds the model that represents the policy of the agent.
Args:
observations: Dictionary of observations from different modalities. Keys
are the names of the modalities. Observations should have the following
key-values.
observations['goal']: One-hot tensor that indicates the semantic
category of the goal. The shape should be
(batch_size x max_sequence_length x goals).
observations[task_env.ModalityTypes.PREV_ACTION]: has action_size + 1
elements where the first action_size numbers are the one hot vector
of the previous action and the last element indicates whether the
previous action was successful or not. If
task_env.ModalityTypes.PREV_ACTION is not in the observation, it
will not be used in the policy.
prev_state: Previous state of the model. It should be a tuple of (c,h)
where c and h are the previous cell value and hidden state of the lstm.
Each element of tuple has shape of (batch_size x lstm_cell_size).
If it is set to None, then it initializes the state of the lstm with all
zeros.
Returns:
Tuple of (action, state) where action is the action logits and state is
the state of the model after taking new observation.
Raises:
ValueError: If any of the modality names is not in observations or
embedders_dict.
ValueError: If 'goal' is not in the observations.
"""
for modality_name in self._modality_names:
if modality_name not in observations:
raise ValueError('modality name does not exist in observations: {} not '
'in {}'.format(modality_name, observations.keys()))
if modality_name not in self._embedders:
if modality_name == task_env.ModalityTypes.PREV_ACTION:
continue
raise ValueError('modality name does not have corresponding embedder'
' {} not in {}'.format(modality_name,
self._embedders.keys()))
if task_env.ModalityTypes.GOAL not in observations:
raise ValueError('goal should be provided in the observations')
goal = observations[task_env.ModalityTypes.GOAL]
prev_action = None
if task_env.ModalityTypes.PREV_ACTION in observations:
prev_action = observations[task_env.ModalityTypes.PREV_ACTION]
with tf.variable_scope('policy'):
with slim.arg_scope(
[slim.fully_connected],
activation_fn=tf.nn.relu,
weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
weights_regularizer=slim.l2_regularizer(self._weight_decay)):
all_inputs = []
# Concatenating the embedding of each modality by applying the embedders
# to corresponding observations.
def embed(name):
with tf.variable_scope('embed_{}'.format(name)):
# logging.info('Policy uses embedding %s', name)
return self._embedders[name].build(observations[name])
all_inputs = list(map(embed, [
x for x in self._modality_names
if x != task_env.ModalityTypes.PREV_ACTION
]))
# Computing goal embedding.
shape = goal.get_shape().as_list()
with tf.variable_scope('embed_goal'):
encoded_goal = tf.reshape(goal, [shape[0] * shape[1], -1])
encoded_goal = slim.fully_connected(encoded_goal,
self._target_embedding_size)
encoded_goal = tf.reshape(encoded_goal, [shape[0], shape[1], -1])
all_inputs.append(encoded_goal)
# Concatenating all the modalities and goal.
all_inputs = tf.concat(all_inputs, axis=-1, name='concat_embeddings')
shape = all_inputs.get_shape().as_list()
all_inputs = tf.reshape(all_inputs, [shape[0] * shape[1], shape[2]])
# Applying fully connected layers.
encoded_inputs = slim.fully_connected(all_inputs, self._fc_channels)
encoded_inputs = slim.fully_connected(encoded_inputs, self._fc_channels)
if not self._feedforward_mode:
encoded_inputs = tf.reshape(encoded_inputs,
[shape[0], shape[1], self._fc_channels])
lstm_outputs, lstm_state = self._build_lstm(
encoded_inputs=encoded_inputs,
prev_state=prev_state,
episode_length=tf.ones((shape[0],), dtype=tf.float32) *
self._max_episode_length,
prev_action=prev_action,
)
else:
# If feedforward_mode=True, bypass the LSTM computation entirely.
lstm_outputs = encoded_inputs
lstm_outputs = slim.fully_connected(lstm_outputs, self._fc_channels)
action_values = slim.fully_connected(
lstm_outputs, self._action_size, activation_fn=None)
action_values = tf.reshape(action_values, [shape[0], shape[1], -1])
if not self._feedforward_mode:
return action_values, lstm_state
else:
return action_values, None
class TaskPolicy(Policy):
"""A covenience abstract class providing functionality to deal with Tasks."""
def __init__(self,
task_config,
model_hparams=None,
embedder_hparams=None,
train_hparams=None):
"""Constructs a policy which knows how to work with tasks (see tasks.py).
It allows reading task history, goal and outputs consistently with the
task config.
Args:
task_config: an object of type tasks.TaskIOConfig (see tasks.py)
model_hparams: a tf.HParams object containing parameters pertaining to the
model (these are implementation specific)
embedder_hparams: a tf.HParams object containing parameters pertaining to
the history and goal embedders (these are implementation specific)
train_hparams: a tf.HParams object containing parameters pertaining to
training (these are implementation specific)
"""
super(TaskPolicy, self).__init__(None, None)
self._model_hparams = model_hparams
self._embedder_hparams = embedder_hparams
self._train_hparams = train_hparams
self._task_config = task_config
self._extra_train_ops = []
@property
def extra_train_ops(self):
"""Training ops in addition to the loss, e.g. batch norm updates.
Returns:
A list of tf ops.
"""
return self._extra_train_ops
def _embed_task_ios(self, streams):
"""Embeds a list of heterogenous streams.
These streams correspond to task history, goal and output. The number of
streams is equal to the total number of history, plus one for the goal if
present, plus one for the output. If the number of history is k, then the
first k streams are the history.
The used embedders depend on the input (or goal) types. If an input is an
image, then a ResNet embedder is used, otherwise
MLPEmbedder (see embedders.py).
Args:
streams: a list of Tensors.
Returns:
Three float Tensors history, goal, output. If there is no history, or no
goal, then the corresponding returned values are None. The shape of the
embedded history is batch_size x sequence_length x the sum of the embedding
dimensions of all history inputs. The shape of the goal is the embedding
dimension.
"""
# EMBED history.
index = 0
inps = []
scopes = []
for c in self._task_config.inputs:
if c == task_env.ModalityTypes.IMAGE:
scope_name = 'image_embedder/image'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
resnet_embedder = embedders.ResNet(self._embedder_hparams.image)
image_embeddings = resnet_embedder.build(streams[index])
# Uncover batch norm ops.
if self._embedder_hparams.image.is_train:
self._extra_train_ops += resnet_embedder.extra_train_ops
inps.append(image_embeddings)
index += 1
else:
scope_name = 'input_embedder/vector'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
input_vector_embedder = embedders.MLPEmbedder(
layers=self._embedder_hparams.vector)
vector_embedder = input_vector_embedder.build(streams[index])
inps.append(vector_embedder)
index += 1
history = tf.concat(inps, axis=2) if inps else None
# EMBED goal.
goal = None
if self._task_config.query is not None:
scope_name = 'image_embedder/query'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
resnet_goal_embedder = embedders.ResNet(self._embedder_hparams.goal)
goal = resnet_goal_embedder.build(streams[index])
if self._embedder_hparams.goal.is_train:
self._extra_train_ops += resnet_goal_embedder.extra_train_ops
index += 1
# Embed true targets if needed (tbd).
true_target = streams[index]
return history, goal, true_target
@abc.abstractmethod
def build(self, feeds, prev_state):
pass
class ReactivePolicy(TaskPolicy):
"""A policy which ignores history.
It processes only the current observation (last element in history) and the
goal to output a prediction.
"""
def __init__(self, *args, **kwargs):
super(ReactivePolicy, self).__init__(*args, **kwargs)
# The current implementation ignores the prev_state as it is purely reactive.
# It returns None for the current state.
def build(self, feeds, prev_state):
history, goal, _ = self._embed_task_ios(feeds)
_print_debug_ios(history, goal, None)
with tf.variable_scope('output_decoder'):
# Concatenate the embeddings of the current observation and the goal.
reactive_input = tf.concat([tf.squeeze(history[:, -1, :]), goal], axis=1)
oconfig = self._task_config.output.shape
assert len(oconfig) == 1
decoder = embedders.MLPEmbedder(
layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
predictions = decoder.build(reactive_input)
return predictions, None
class RNNPolicy(TaskPolicy):
"""A policy which takes into account the full history via RNN.
The implementation might (and likely will) change.
The history, together with the goal, is processed using a stacked LSTM. The
output of the last LSTM step is used to produce a prediction. Currently, only
a single step output is supported.
"""
def __init__(self, lstm_hparams, *args, **kwargs):
super(RNNPolicy, self).__init__(*args, **kwargs)
self._lstm_hparams = lstm_hparams
# The prev_state is ignored as for now the full history is specified as first
# element of the feeds. It might turn out to be beneficial to keep the state
# as part of the policy object.
def build(self, feeds, state):
history, goal, _ = self._embed_task_ios(feeds)
_print_debug_ios(history, goal, None)
params = self._lstm_hparams
cell = lambda: tf.contrib.rnn.BasicLSTMCell(params.cell_size)
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
[cell() for _ in range(params.num_layers)])
# history is of shape batch_size x seq_len x embedding_dimension
batch_size, seq_len, _ = tuple(history.get_shape().as_list())
if state is None:
state = stacked_lstm.zero_state(batch_size, tf.float32)
for t in range(seq_len):
if params.concat_goal_everywhere:
lstm_input = tf.concat([tf.squeeze(history[:, t, :]), goal], axis=1)
else:
lstm_input = tf.squeeze(history[:, t, :])
output, state = stacked_lstm(lstm_input, state)
with tf.variable_scope('output_decoder'):
oconfig = self._task_config.output.shape
assert len(oconfig) == 1
features = tf.concat([output, goal], axis=1)
assert len(output.get_shape().as_list()) == 2
assert len(goal.get_shape().as_list()) == 2
decoder = embedders.MLPEmbedder(
layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
# Prediction is done off the last step lstm output and the goal.
predictions = decoder.build(features)
return predictions, state
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images in CIFAR-10.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_PADDING = 4
slim = tf.contrib.slim
def preprocess_for_train(image,
output_height,
output_width,
padding=_PADDING,
add_image_summaries=True):
"""Preprocesses the given image for training.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
padding: The amount of padding before and after each dimension of the image.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if add_image_summaries:
tf.summary.image('image', tf.expand_dims(image, 0))
# Transform the image to floats.
image = tf.to_float(image)
if padding > 0:
image = tf.pad(image, [[padding, padding], [padding, padding], [0, 0]])
# Randomly crop a [height, width] section of the image.
distorted_image = tf.random_crop(image,
[output_height, output_width, 3])
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
if add_image_summaries:
tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0))
# Because these operations are not commutative, consider randomizing
# the order of their operation.
distorted_image = tf.image.random_brightness(distorted_image,
max_delta=63)
distorted_image = tf.image.random_contrast(distorted_image,
lower=0.2, upper=1.8)
# Subtract off the mean and divide by the variance of the pixels.
return tf.image.per_image_standardization(distorted_image)
def preprocess_for_eval(image, output_height, output_width,
add_image_summaries=True):
"""Preprocesses the given image for evaluation.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if add_image_summaries:
tf.summary.image('image', tf.expand_dims(image, 0))
# Transform the image to floats.
image = tf.to_float(image)
# Resize and crop if needed.
resized_image = tf.image.resize_image_with_crop_or_pad(image,
output_width,
output_height)
if add_image_summaries:
tf.summary.image('resized_image', tf.expand_dims(resized_image, 0))
# Subtract off the mean and divide by the variance of the pixels.
return tf.image.per_image_standardization(resized_image)
def preprocess_image(image, output_height, output_width, is_training=False,
add_image_summaries=True):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if is_training:
return preprocess_for_train(
image, output_height, output_width,
add_image_summaries=add_image_summaries)
else:
return preprocess_for_eval(
image, output_height, output_width,
add_image_summaries=add_image_summaries)
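A minimal usage sketch (the input tensor is illustrative): preprocess a single 32x32 CIFAR-10 image for training inside a TF1 graph.
if __name__ == '__main__':
  example_image = tf.cast(
      tf.random_uniform([32, 32, 3], maxval=256, dtype=tf.int32), tf.uint8)
  train_image = preprocess_image(example_image, 24, 24, is_training=True,
                                 add_image_summaries=False)
  print(train_image)  # A float32 Tensor of shape (24, 24, 3).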
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images for the Inception networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
def apply_with_random_selector(x, func, num_cases):
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
Args:
x: input Tensor.
func: Python function to apply.
num_cases: Python int32, number of cases to sample sel from.
Returns:
The result of func(x, sel), where func receives the value of the
selector as a python integer, but sel is sampled dynamically.
"""
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# Pass the real x only to one of the func calls.
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
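A minimal usage sketch (image tensor and sizes are illustrative), mirroring how this helper is used further down in this file: one of four resize methods is chosen at graph-execution time rather than at graph-definition time.
example_image = tf.random_uniform([224, 224, 3])  # float image in [0, 1)
resized = apply_with_random_selector(
    example_image,
    lambda x, method: tf.image.resize_images(x, [128, 128], method),
    num_cases=4)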
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
"""Distort the color of a Tensor image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
Rather than adding that level of complication, we select a distinct ordering
of color ops for each preprocessing thread.
Args:
image: 3-D Tensor containing single image in [0, 1].
color_ordering: Python int, a type of distortion (valid values: 0-3).
fast_mode: Avoids slower ops (random_hue and random_contrast)
scope: Optional scope for name_scope.
Returns:
3-D Tensor color-distorted image on range [0, 1]
Raises:
ValueError: if color_ordering not in [0, 3]
"""
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def distorted_bounding_box_crop(image,
bbox,
min_object_covered=0.1,
aspect_ratio_range=(0.75, 1.33),
area_range=(0.05, 1.0),
max_attempts=100,
scope=None):
"""Generates cropped_image using a one of the bboxes randomly distorted.
See `tf.image.sample_distorted_bounding_box` for more documentation.
Args:
image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
image.
min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
area of the image must contain at least this fraction of any bounding box
supplied.
aspect_ratio_range: An optional list of `floats`. The cropped area of the
image must have an aspect ratio = width / height within this range.
area_range: An optional list of `floats`. The cropped area of the image
must contain a fraction of the supplied image within this range.
max_attempts: An optional `int`. Number of attempts at generating a cropped
region of the image of the specified constraints. After `max_attempts`
failures, return the entire image.
scope: Optional scope for name_scope.
Returns:
A tuple, a 3-D Tensor cropped_image and the distorted bbox
"""
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bbox]):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
# A large fraction of image datasets contain a human-annotated bounding
# box delineating the region of the image containing the object of interest.
# We choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
# Crop the image to the specified bounding box.
cropped_image = tf.slice(image, bbox_begin, bbox_size)
return cropped_image, distort_bbox
def preprocess_for_train(image, height, width, bbox,
fast_mode=True,
scope=None,
add_image_summaries=True):
"""Distort one image for training a network.
Distorting images provides a useful technique for augmenting the data
set during training in order to make the network invariant to aspects
of the image that do not affect the label.
Additionally it would create image_summaries to display the different
transformations applied to the image.
Args:
image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
[0, 1], otherwise it would be converted to tf.float32 assuming that the range
is [0, MAX], where MAX is the largest positive representable number for the
int(8/16/32) data type (see `tf.image.convert_image_dtype` for details).
height: integer
width: integer
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
fast_mode: Optional boolean, if True avoids slower transformations (i.e.
bi-cubic resizing, random_hue or random_contrast).
scope: Optional scope for name_scope.
add_image_summaries: Enable image summaries.
Returns:
3-D float Tensor of distorted image used for training with range [-1, 1].
"""
with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
if bbox is None:
bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
dtype=tf.float32,
shape=[1, 1, 4])
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
bbox)
if add_image_summaries:
tf.summary.image('image_with_bounding_boxes', image_with_box)
distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([None, None, 3])
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distorted_bbox)
if add_image_summaries:
tf.summary.image('images_with_distorted_bounding_box',
image_with_distorted_box)
# This resizing operation may distort the images because the aspect
# ratio is not respected. We select a resize method in a round robin
# fashion based on the thread number.
# Note that ResizeMethod contains 4 enumerated resizing methods.
# We select only 1 case for fast_mode bilinear.
num_resize_cases = 1 if fast_mode else 4
distorted_image = apply_with_random_selector(
distorted_image,
lambda x, method: tf.image.resize_images(x, [height, width], method),
num_cases=num_resize_cases)
if add_image_summaries:
tf.summary.image('cropped_resized_image',
tf.expand_dims(distorted_image, 0))
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
# Randomly distort the colors. There are 1 or 4 ways to do it.
num_distort_cases = 1 if fast_mode else 4
distorted_image = apply_with_random_selector(
distorted_image,
lambda x, ordering: distort_color(x, ordering, fast_mode),
num_cases=num_distort_cases)
if add_image_summaries:
tf.summary.image('final_distorted_image',
tf.expand_dims(distorted_image, 0))
distorted_image = tf.subtract(distorted_image, 0.5)
distorted_image = tf.multiply(distorted_image, 2.0)
return distorted_image
def preprocess_for_eval(image, height, width,
central_fraction=0.875, scope=None):
"""Prepare one image for evaluation.
If height and width are specified it would output an image with that size by
applying resize_bilinear.
If central_fraction is specified it would crop the central fraction of the
input image.
Args:
image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
[0, 1], otherwise it would be converted to tf.float32 assuming that the range
is [0, MAX], where MAX is the largest positive representable number for the
int(8/16/32) data type (see `tf.image.convert_image_dtype` for details).
height: integer
width: integer
central_fraction: Optional Float, fraction of the image to crop.
scope: Optional scope for name_scope.
Returns:
3-D float Tensor of prepared image.
"""
with tf.name_scope(scope, 'eval_image', [image, height, width]):
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
if central_fraction:
image = tf.image.central_crop(image, central_fraction=central_fraction)
if height and width:
# Resize the image to the specified height and width.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width],
align_corners=False)
image = tf.squeeze(image, [0])
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
return image
def preprocess_image(image, height, width,
is_training=False,
bbox=None,
fast_mode=True,
add_image_summaries=True):
"""Pre-process one image for training or evaluation.
Args:
image: 3-D Tensor [height, width, channels] with the image. If dtype is
tf.float32 then the range should be [0, 1], otherwise it would be converted
to tf.float32 assuming that the range is [0, MAX], where MAX is the largest
positive representable number for the int(8/16/32) data type (see
`tf.image.convert_image_dtype` for details).
height: integer, image expected height.
width: integer, image expected width.
is_training: Boolean. If true it would transform an image for training,
otherwise it would transform it for evaluation.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
fast_mode: Optional boolean, if True avoids slower transformations.
add_image_summaries: Enable image summaries.
Returns:
3-D float Tensor containing an appropriately scaled image
Raises:
ValueError: if user does not provide bounding box
"""
if is_training:
return preprocess_for_train(image, height, width, bbox, fast_mode,
add_image_summaries=add_image_summaries)
else:
return preprocess_for_eval(image, height, width)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities for preprocessing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
def preprocess_image(image, output_height, output_width, is_training):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
Returns:
A preprocessed image.
"""
image = tf.to_float(image)
image = tf.image.resize_image_with_crop_or_pad(
image, output_width, output_height)
image = tf.subtract(image, 128.0)
image = tf.div(image, 128.0)
return image
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains a factory for building various models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from preprocessing import cifarnet_preprocessing
from preprocessing import inception_preprocessing
from preprocessing import lenet_preprocessing
from preprocessing import vgg_preprocessing
slim = tf.contrib.slim
def get_preprocessing(name, is_training=False):
"""Returns preprocessing_fn(image, height, width, **kwargs).
Args:
name: The name of the preprocessing function.
is_training: `True` if the model is being used for training and `False`
otherwise.
Returns:
preprocessing_fn: A function that preprocesses a single image (pre-batch).
It has the following signature:
image = preprocessing_fn(image, output_height, output_width, ...).
Raises:
ValueError: If Preprocessing `name` is not recognized.
"""
preprocessing_fn_map = {
'cifarnet': cifarnet_preprocessing,
'inception': inception_preprocessing,
'inception_v1': inception_preprocessing,
'inception_v2': inception_preprocessing,
'inception_v3': inception_preprocessing,
'inception_v4': inception_preprocessing,
'inception_resnet_v2': inception_preprocessing,
'lenet': lenet_preprocessing,
'mobilenet_v1': inception_preprocessing,
'nasnet_mobile': inception_preprocessing,
'nasnet_large': inception_preprocessing,
'pnasnet_large': inception_preprocessing,
'resnet_v1_50': vgg_preprocessing,
'resnet_v1_101': vgg_preprocessing,
'resnet_v1_152': vgg_preprocessing,
'resnet_v1_200': vgg_preprocessing,
'resnet_v2_50': vgg_preprocessing,
'resnet_v2_101': vgg_preprocessing,
'resnet_v2_152': vgg_preprocessing,
'resnet_v2_200': vgg_preprocessing,
'vgg': vgg_preprocessing,
'vgg_a': vgg_preprocessing,
'vgg_16': vgg_preprocessing,
'vgg_19': vgg_preprocessing,
}
if name not in preprocessing_fn_map:
raise ValueError('Preprocessing name [%s] was not recognized' % name)
def preprocessing_fn(image, output_height, output_width, **kwargs):
return preprocessing_fn_map[name].preprocess_image(
image, output_height, output_width, is_training=is_training, **kwargs)
return preprocessing_fn
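A minimal usage sketch (the image tensor is illustrative): fetch the Inception preprocessing function from the factory and apply it to one image for evaluation.
if __name__ == '__main__':
  example_image = tf.cast(
      tf.random_uniform([300, 400, 3], maxval=256, dtype=tf.int32), tf.uint8)
  preprocessing_fn = get_preprocessing('inception_v3', is_training=False)
  eval_image = preprocessing_fn(example_image, 299, 299)
  print(eval_image)  # A float32 Tensor of shape (299, 299, 3) in [-1, 1].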
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
The preprocessing steps for VGG were introduced in the following technical
report:
Very Deep Convolutional Networks For Large-Scale Image Recognition
Karen Simonyan and Andrew Zisserman
arXiv technical report, 2015
PDF: http://arxiv.org/pdf/1409.1556.pdf
ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
CC-BY-4.0
More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512
def _crop(image, offset_height, offset_width, crop_height, crop_width):
"""Crops the given image using the provided offsets and sizes.
Note that the method doesn't assume we know the input image size but it does
assume we know the input image rank.
Args:
image: an image of shape [height, width, channels].
offset_height: a scalar tensor indicating the height offset.
offset_width: a scalar tensor indicating the width offset.
crop_height: the height of the cropped image.
crop_width: the width of the cropped image.
Returns:
the cropped (and resized) image.
Raises:
InvalidArgumentError: if the rank is not 3 or if the image dimensions are
less than the crop size.
"""
original_shape = tf.shape(image)
rank_assertion = tf.Assert(
tf.equal(tf.rank(image), 3),
['Rank of image must be equal to 3.'])
with tf.control_dependencies([rank_assertion]):
cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])
size_assertion = tf.Assert(
tf.logical_and(
tf.greater_equal(original_shape[0], crop_height),
tf.greater_equal(original_shape[1], crop_width)),
['Crop size greater than the image size.'])
offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))
# Use tf.slice instead of crop_to_bounding_box as it accepts tensors to
# define the crop size.
with tf.control_dependencies([size_assertion]):
image = tf.slice(image, offsets, cropped_shape)
return tf.reshape(image, cropped_shape)
def _random_crop(image_list, crop_height, crop_width):
"""Crops the given list of images.
The function applies the same crop to each image in the list. This can be
effectively applied when there are multiple image inputs of the same
dimension such as:
image, depths, normals = _random_crop([image, depths, normals], 120, 150)
Args:
image_list: a list of image tensors of the same dimension but possibly
varying channel.
crop_height: the new height.
crop_width: the new width.
Returns:
the image_list with cropped images.
Raises:
ValueError: if there are multiple image inputs provided with different size
or the images are smaller than the crop dimensions.
"""
if not image_list:
raise ValueError('Empty image_list.')
# Compute the rank assertions.
rank_assertions = []
for i in range(len(image_list)):
image_rank = tf.rank(image_list[i])
rank_assert = tf.Assert(
tf.equal(image_rank, 3),
['Wrong rank for tensor %s [expected] [actual]',
image_list[i].name, 3, image_rank])
rank_assertions.append(rank_assert)
with tf.control_dependencies([rank_assertions[0]]):
image_shape = tf.shape(image_list[0])
image_height = image_shape[0]
image_width = image_shape[1]
crop_size_assert = tf.Assert(
tf.logical_and(
tf.greater_equal(image_height, crop_height),
tf.greater_equal(image_width, crop_width)),
['Crop size greater than the image size.'])
asserts = [rank_assertions[0], crop_size_assert]
for i in range(1, len(image_list)):
image = image_list[i]
asserts.append(rank_assertions[i])
with tf.control_dependencies([rank_assertions[i]]):
shape = tf.shape(image)
height = shape[0]
width = shape[1]
height_assert = tf.Assert(
tf.equal(height, image_height),
['Wrong height for tensor %s [expected][actual]',
image.name, height, image_height])
width_assert = tf.Assert(
tf.equal(width, image_width),
['Wrong width for tensor %s [expected][actual]',
image.name, width, image_width])
asserts.extend([height_assert, width_assert])
# Create a random bounding box.
#
# Use tf.random_uniform and not numpy.random.rand as doing the former would
# generate random numbers at graph eval time, unlike the latter which
# generates random numbers at graph definition time.
with tf.control_dependencies(asserts):
max_offset_height = tf.reshape(image_height - crop_height + 1, [])
with tf.control_dependencies(asserts):
max_offset_width = tf.reshape(image_width - crop_width + 1, [])
offset_height = tf.random_uniform(
[], maxval=max_offset_height, dtype=tf.int32)
offset_width = tf.random_uniform(
[], maxval=max_offset_width, dtype=tf.int32)
return [_crop(image, offset_height, offset_width,
crop_height, crop_width) for image in image_list]
def _central_crop(image_list, crop_height, crop_width):
"""Performs central crops of the given image list.
Args:
image_list: a list of image tensors of the same dimension but possibly
varying channel.
crop_height: the height of the image following the crop.
crop_width: the width of the image following the crop.
Returns:
the list of cropped images.
"""
outputs = []
for image in image_list:
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
offset_height = (image_height - crop_height) / 2
offset_width = (image_width - crop_width) / 2
outputs.append(_crop(image, offset_height, offset_width,
crop_height, crop_width))
return outputs
def _mean_image_subtraction(image, means):
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image = _mean_image_subtraction(image, means)
Note that the rank of `image` must be known.
Args:
image: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
num_channels = image.get_shape().as_list()[-1]
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
for i in range(num_channels):
channels[i] -= means[i]
return tf.concat(axis=2, values=channels)
def _smallest_size_at_least(height, width, smallest_side):
"""Computes new shape with the smallest side equal to `smallest_side`.
Computes new shape with the smallest side equal to `smallest_side` while
preserving the original aspect ratio.
Args:
height: an int32 scalar tensor indicating the current height.
width: an int32 scalar tensor indicating the current width.
smallest_side: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
new_height: an int32 scalar tensor indicating the new height.
new_width: an int32 scalar tensor indicating the new width.
"""
smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
height = tf.to_float(height)
width = tf.to_float(width)
smallest_side = tf.to_float(smallest_side)
scale = tf.cond(tf.greater(height, width),
lambda: smallest_side / width,
lambda: smallest_side / height)
new_height = tf.to_int32(tf.rint(height * scale))
new_width = tf.to_int32(tf.rint(width * scale))
return new_height, new_width
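# Worked example (illustrative): for an image of height 400 and width 600 with
# smallest_side 256, height <= width, so scale = 256 / 400 = 0.64,
# new_height = rint(400 * 0.64) = 256 and new_width = rint(600 * 0.64) = 384;
# the 2:3 aspect ratio is preserved.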
def _aspect_preserving_resize(image, smallest_side):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
smallest_side: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
shape = tf.shape(image)
height = shape[0]
width = shape[1]
new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
image = tf.expand_dims(image, 0)
resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
align_corners=False)
resized_image = tf.squeeze(resized_image)
resized_image.set_shape([None, None, 3])
return resized_image
def preprocess_for_train(image,
output_height,
output_width,
resize_side_min=_RESIZE_SIDE_MIN,
resize_side_max=_RESIZE_SIDE_MAX):
"""Preprocesses the given image for training.
  Note that the smallest side used for the aspect-preserving resize is sampled
  uniformly from [`resize_side_min`, `resize_side_max`].
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing.
Returns:
A preprocessed image.
"""
resize_side = tf.random_uniform(
[], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32)
image = _aspect_preserving_resize(image, resize_side)
image = _random_crop([image], output_height, output_width)[0]
image.set_shape([output_height, output_width, 3])
image = tf.to_float(image)
image = tf.image.random_flip_left_right(image)
return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_for_eval(image, output_height, output_width, resize_side):
"""Preprocesses the given image for evaluation.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
resize_side: The smallest side of the image for aspect-preserving resizing.
Returns:
A preprocessed image.
"""
image = _aspect_preserving_resize(image, resize_side)
image = _central_crop([image], output_height, output_width)[0]
image.set_shape([output_height, output_width, 3])
image = tf.to_float(image)
return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_image(image, output_height, output_width, is_training=False,
resize_side_min=_RESIZE_SIDE_MIN,
resize_side_max=_RESIZE_SIDE_MAX):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, then this value
is used for rescaling.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, this value is
      ignored. Otherwise, the resize side is sampled from
      [resize_side_min, resize_side_max].
Returns:
A preprocessed image.
"""
if is_training:
return preprocess_for_train(image, output_height, output_width,
resize_side_min, resize_side_max)
else:
return preprocess_for_eval(image, output_height, output_width,
resize_side_min)
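# A minimal usage sketch, assuming a decoded 3-channel uint8 image tensor and
# an illustrative 224x224 output size; the helper name
# `_example_vgg_preprocessing` is hypothetical and only shows how the entry
# point above might be wired into a graph.
def _example_vgg_preprocessing():
  """Builds train- and eval-time preprocessing ops for a placeholder image."""
  raw_image = tf.placeholder(tf.uint8, shape=[None, None, 3])
  train_image = preprocess_image(raw_image, 224, 224, is_training=True)
  eval_image = preprocess_image(raw_image, 224, 224, is_training=False)
  return train_image, eval_image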
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes specifying naming conventions used for object detection.
Specifies:
InputDataFields: standard fields used by reader/preprocessor/batcher.
DetectionResultFields: standard fields returned by object detector.
  BoxListFields: standard fields used by BoxLists.
TfExampleFields: standard fields for tf-example data format (go/tf-example).
"""
class InputDataFields(object):
"""Names for the input tensors.
  Holds the standard data field names to use for identifying input tensors.
  These names should be used by the decoder to identify keys for the returned
  tensor_dict containing input tensors, and by the model to identify the
  tensors it needs.
Attributes:
image: image.
image_additional_channels: additional channels.
original_image: image in the original input size.
key: unique key corresponding to image.
source_id: source of the original image.
filename: original filename of the dataset (without common path).
groundtruth_image_classes: image-level class labels.
groundtruth_boxes: coordinates of the ground truth boxes in the image.
groundtruth_classes: box-level class labels.
groundtruth_label_types: box-level label types (e.g. explicit negative).
groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]
is the groundtruth a single object or a crowd.
groundtruth_area: area of a groundtruth segment.
groundtruth_difficult: is a `difficult` object
groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the
same class, forming a connected group, where instances are heavily
occluding each other.
proposal_boxes: coordinates of object proposal boxes.
proposal_objectness: objectness score of each proposal.
groundtruth_instance_masks: ground truth instance masks.
groundtruth_instance_boundaries: ground truth instance boundaries.
groundtruth_instance_classes: instance mask-level class labels.
groundtruth_keypoints: ground truth keypoints.
groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
groundtruth_label_scores: groundtruth label scores.
groundtruth_weights: groundtruth weight factor for bounding boxes.
num_groundtruth_boxes: number of groundtruth boxes.
    true_image_shape: true shape of the image within the resized image, as
      resized images can be padded with zeros.
multiclass_scores: the label score per class for each box.
"""
image = 'image'
image_additional_channels = 'image_additional_channels'
original_image = 'original_image'
key = 'key'
source_id = 'source_id'
filename = 'filename'
groundtruth_image_classes = 'groundtruth_image_classes'
groundtruth_boxes = 'groundtruth_boxes'
groundtruth_classes = 'groundtruth_classes'
groundtruth_label_types = 'groundtruth_label_types'
groundtruth_is_crowd = 'groundtruth_is_crowd'
groundtruth_area = 'groundtruth_area'
groundtruth_difficult = 'groundtruth_difficult'
groundtruth_group_of = 'groundtruth_group_of'
proposal_boxes = 'proposal_boxes'
proposal_objectness = 'proposal_objectness'
groundtruth_instance_masks = 'groundtruth_instance_masks'
groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'
groundtruth_instance_classes = 'groundtruth_instance_classes'
groundtruth_keypoints = 'groundtruth_keypoints'
groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
groundtruth_label_scores = 'groundtruth_label_scores'
groundtruth_weights = 'groundtruth_weights'
num_groundtruth_boxes = 'num_groundtruth_boxes'
true_image_shape = 'true_image_shape'
multiclass_scores = 'multiclass_scores'
class DetectionResultFields(object):
"""Naming conventions for storing the output of the detector.
Attributes:
source_id: source of the original image.
key: unique key corresponding to image.
detection_boxes: coordinates of the detection boxes in the image.
detection_scores: detection scores for the detection boxes in the image.
detection_classes: detection-level class labels.
detection_masks: contains a segmentation mask for each detection box.
detection_boundaries: contains an object boundary for each detection box.
detection_keypoints: contains detection keypoints for each detection box.
num_detections: number of detections in the batch.
"""
source_id = 'source_id'
key = 'key'
detection_boxes = 'detection_boxes'
detection_scores = 'detection_scores'
detection_classes = 'detection_classes'
detection_masks = 'detection_masks'
detection_boundaries = 'detection_boundaries'
detection_keypoints = 'detection_keypoints'
num_detections = 'num_detections'
class BoxListFields(object):
"""Naming conventions for BoxLists.
Attributes:
boxes: bounding box coordinates.
classes: classes per bounding box.
scores: scores per bounding box.
weights: sample weights per bounding box.
objectness: objectness score per bounding box.
masks: masks per bounding box.
boundaries: boundaries per bounding box.
keypoints: keypoints per bounding box.
keypoint_heatmaps: keypoint heatmaps per bounding box.
is_crowd: is_crowd annotation per bounding box.
"""
boxes = 'boxes'
classes = 'classes'
scores = 'scores'
weights = 'weights'
objectness = 'objectness'
masks = 'masks'
boundaries = 'boundaries'
keypoints = 'keypoints'
keypoint_heatmaps = 'keypoint_heatmaps'
is_crowd = 'is_crowd'
class TfExampleFields(object):
"""TF-example proto feature names for object detection.
Holds the standard feature names to load from an Example proto for object
detection.
Attributes:
image_encoded: JPEG encoded string
image_format: image format, e.g. "JPEG"
filename: filename
channels: number of channels of image
colorspace: colorspace, e.g. "RGB"
height: height of image in pixels, e.g. 462
width: width of image in pixels, e.g. 581
source_id: original source of the image
image_class_text: image-level label in text format
image_class_label: image-level label in numerical format
object_class_text: labels in text format, e.g. ["person", "cat"]
object_class_label: labels in numbers, e.g. [16, 8]
object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30
object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40
object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50
object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70
object_view: viewpoint of object, e.g. ["frontal", "left"]
object_truncated: is object truncated, e.g. [true, false]
object_occluded: is object occluded, e.g. [true, false]
object_difficult: is object difficult, e.g. [true, false]
object_group_of: is object a single object or a group of objects
object_depiction: is object a depiction
object_is_crowd: [DEPRECATED, use object_group_of instead]
is the object a single object or a crowd
object_segment_area: the area of the segment.
object_weight: a weight factor for the object's bounding box.
instance_masks: instance segmentation masks.
instance_boundaries: instance boundaries.
instance_classes: Classes for each instance segmentation mask.
detection_class_label: class label in numbers.
detection_bbox_ymin: ymin coordinates of a detection box.
detection_bbox_xmin: xmin coordinates of a detection box.
detection_bbox_ymax: ymax coordinates of a detection box.
detection_bbox_xmax: xmax coordinates of a detection box.
detection_score: detection score for the class label and box.
"""
image_encoded = 'image/encoded'
image_format = 'image/format' # format is reserved keyword
filename = 'image/filename'
channels = 'image/channels'
colorspace = 'image/colorspace'
height = 'image/height'
width = 'image/width'
source_id = 'image/source_id'
image_class_text = 'image/class/text'
image_class_label = 'image/class/label'
object_class_text = 'image/object/class/text'
object_class_label = 'image/object/class/label'
object_bbox_ymin = 'image/object/bbox/ymin'
object_bbox_xmin = 'image/object/bbox/xmin'
object_bbox_ymax = 'image/object/bbox/ymax'
object_bbox_xmax = 'image/object/bbox/xmax'
object_view = 'image/object/view'
object_truncated = 'image/object/truncated'
object_occluded = 'image/object/occluded'
object_difficult = 'image/object/difficult'
object_group_of = 'image/object/group_of'
object_depiction = 'image/object/depiction'
object_is_crowd = 'image/object/is_crowd'
object_segment_area = 'image/object/segment/area'
object_weight = 'image/object/weight'
instance_masks = 'image/segmentation/object'
instance_boundaries = 'image/boundaries/object'
instance_classes = 'image/segmentation/object/class'
detection_class_label = 'image/detection/label'
detection_bbox_ymin = 'image/detection/bbox/ymin'
detection_bbox_xmin = 'image/detection/bbox/xmin'
detection_bbox_ymax = 'image/detection/bbox/ymax'
detection_bbox_xmax = 'image/detection/bbox/xmax'
detection_score = 'image/detection/score'
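# Illustrative sketch: the string constants in TfExampleFields are meant to be
# used as feature keys, e.g. when assembling a tf.train.Example. The import,
# the literal values, and the `jpeg_bytes` placeholder below are assumptions.
#
#   import tensorflow as tf
#   feature = {
#       TfExampleFields.image_encoded:
#           tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
#       TfExampleFields.height:
#           tf.train.Feature(int64_list=tf.train.Int64List(value=[462])),
#   }
#   example = tf.train.Example(features=tf.train.Features(feature=feature))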
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: object_detection/protos/string_int_label_map.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='object_detection/protos/string_int_label_map.proto',
package='object_detection.protos',
syntax='proto2',
serialized_pb=_b('\n2object_detection/protos/string_int_label_map.proto\x12\x17object_detection.protos\"G\n\x15StringIntLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"Q\n\x11StringIntLabelMap\x12<\n\x04item\x18\x01 \x03(\x0b\x32..object_detection.protos.StringIntLabelMapItem')
)
_STRINGINTLABELMAPITEM = _descriptor.Descriptor(
name='StringIntLabelMapItem',
full_name='object_detection.protos.StringIntLabelMapItem',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='name', full_name='object_detection.protos.StringIntLabelMapItem.name', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='id', full_name='object_detection.protos.StringIntLabelMapItem.id', index=1,
number=2, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='display_name', full_name='object_detection.protos.StringIntLabelMapItem.display_name', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[
],
serialized_start=79,
serialized_end=150,
)
_STRINGINTLABELMAP = _descriptor.Descriptor(
name='StringIntLabelMap',
full_name='object_detection.protos.StringIntLabelMap',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='item', full_name='object_detection.protos.StringIntLabelMap.item', index=0,
number=1, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[
],
serialized_start=152,
serialized_end=233,
)
_STRINGINTLABELMAP.fields_by_name['item'].message_type = _STRINGINTLABELMAPITEM
DESCRIPTOR.message_types_by_name['StringIntLabelMapItem'] = _STRINGINTLABELMAPITEM
DESCRIPTOR.message_types_by_name['StringIntLabelMap'] = _STRINGINTLABELMAP
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
StringIntLabelMapItem = _reflection.GeneratedProtocolMessageType('StringIntLabelMapItem', (_message.Message,), dict(
DESCRIPTOR = _STRINGINTLABELMAPITEM,
__module__ = 'object_detection.protos.string_int_label_map_pb2'
# @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMapItem)
))
_sym_db.RegisterMessage(StringIntLabelMapItem)
StringIntLabelMap = _reflection.GeneratedProtocolMessageType('StringIntLabelMap', (_message.Message,), dict(
DESCRIPTOR = _STRINGINTLABELMAP,
__module__ = 'object_detection.protos.string_int_label_map_pb2'
# @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMap)
))
_sym_db.RegisterMessage(StringIntLabelMap)
# @@protoc_insertion_point(module_scope)
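# Usage sketch (illustrative, appended after the generated code): a label map
# in text format, such as the item entries earlier in this change, can be
# parsed into the message defined above; 'label_map.pbtxt' is a hypothetical
# path.
#
#   from google.protobuf import text_format
#   label_map = StringIntLabelMap()
#   with open('label_map.pbtxt') as f:
#     text_format.Merge(f.read(), label_map)
#   name_to_id = {item.display_name: item.id for item in label_map.item}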
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A library of tasks.
This interface is intended to implement a wide variety of navigation
tasks. See go/navigation_tasks for a list.
"""
import abc
import collections
import math
import threading
import networkx as nx
import numpy as np
import tensorflow as tf
#from pyglib import logging
#import gin
from envs import task_env
from envs import util as envs_util
# Utility functions.
def _pad_or_clip_array(np_arr, arr_len, is_front_clip=True, output_mask=False):
"""Make np_arr array to have length arr_len.
If the array is shorter than arr_len, then it is padded from the front with
zeros. If it is longer, then it is clipped either from the back or from the
front. Only the first dimension is modified.
Args:
np_arr: numpy array.
arr_len: integer scalar.
is_front_clip: a boolean. If true then clipping is done in the front,
otherwise in the back.
output_mask: If True, outputs a numpy array of rank 1 which represents
a mask of which values have been added (0 - added, 1 - actual output).
Returns:
    A numpy array and the size of the padding (as a python int). This size is
    negative if the array is clipped. If output_mask is True, a mask array is
    returned as a third element.
"""
shape = list(np_arr.shape)
pad_size = arr_len - shape[0]
padded_or_clipped = None
if pad_size < 0:
if is_front_clip:
padded_or_clipped = np_arr[-pad_size:, :]
else:
padded_or_clipped = np_arr[:arr_len, :]
elif pad_size > 0:
padding = np.zeros([pad_size] + shape[1:], dtype=np_arr.dtype)
padded_or_clipped = np.concatenate([np_arr, padding], axis=0)
else:
padded_or_clipped = np_arr
if output_mask:
mask = np.ones((arr_len,), dtype=np.int)
if pad_size > 0:
mask[-pad_size:] = 0
return padded_or_clipped, pad_size, mask
else:
return padded_or_clipped, pad_size
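# Worked example (illustrative): a 2 x 3 array padded to length 4 gains two
# zero rows at the back; pad_size is 2 and the mask marks the padded rows
# with 0.
#
#   arr = np.arange(6).reshape([2, 3])
#   padded, pad_size, mask = _pad_or_clip_array(arr, 4, output_mask=True)
#   # padded.shape == (4, 3), pad_size == 2, mask == [1, 1, 0, 0]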
def classification_loss(truth, predicted, weights=None, is_one_hot=True):
"""A cross entropy loss.
Computes the mean of cross entropy losses for all pairs of true labels and
predictions. It wraps around a tf implementation of the cross entropy loss
  with additional reformatting of the inputs. If truth and predicted are
  n-rank Tensors with n > 2, they are reshaped to 2-rank Tensors. It allows
  truth to be specified either as one-hot vectors or as class indices. Finally,
  a weight can be specified for each element in truth and predicted.
Args:
    truth: an n-rank or (n-1)-rank Tensor containing labels. If is_one_hot is
      True, an n-rank Tensor is expected, otherwise an (n-1)-rank one.
predicted: an n-rank float Tensor containing prediction probabilities.
weights: an (n-1)-rank float Tensor of weights
is_one_hot: a boolean.
Returns:
A TF float scalar.
"""
num_labels = predicted.get_shape().as_list()[-1]
if not is_one_hot:
truth = tf.reshape(truth, [-1])
truth = tf.one_hot(
truth, depth=num_labels, on_value=1.0, off_value=0.0, axis=-1)
else:
truth = tf.reshape(truth, [-1, num_labels])
predicted = tf.reshape(predicted, [-1, num_labels])
losses = tf.nn.softmax_cross_entropy_with_logits(
labels=truth, logits=predicted)
if weights is not None:
losses = tf.boolean_mask(losses,
tf.cast(tf.reshape(weights, [-1]), dtype=tf.bool))
return tf.reduce_mean(losses)
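# Minimal sketch (illustrative): with one-hot truth of shape
# [batch, seq_len, num_labels] and matching logits, the loss reduces to a
# scalar; the tensor values below are assumptions.
#
#   truth = tf.one_hot([[0, 2], [1, 1]], depth=3)   # shape [2, 2, 3]
#   logits = tf.zeros([2, 2, 3])
#   loss = classification_loss(truth, logits)       # scalar Tensor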
class UnrolledTaskIOConfig(object):
"""Configuration of task inputs and outputs.
A task can have multiple inputs, which define the context, and a task query
which defines what is to be executed in this context. The desired execution
is encoded in an output. The config defines the shapes of the inputs, the
query and the outputs.
"""
def __init__(self, inputs, output, query=None):
"""Constructs a Task input/output config.
Args:
      inputs: an OrderedDict mapping modality types to tuples. Each tuple
        represents the configuration of an input, with the first element being
        the type (a tf.DType) and the second element the shape (a list).
output: a tuple representing the configuration of the output.
query: a tuple representing the configuration of the query. If no query,
then None.
"""
    # A configuration of a single input, output or query. Consists of the type,
    # which must be a tf.DType, and a shape. The shape must be consistent with
    # the type, e.g. for an image input the shape is a 3-valued list.
io_config = collections.namedtuple('IOConfig', ['type', 'shape'])
def assert_config(config):
if not isinstance(config, tuple):
raise ValueError('config must be a tuple. Received {}'.format(
type(config)))
if len(config) != 2:
raise ValueError('config must have 2 elements, has %d' % len(config))
if not isinstance(config[0], tf.DType):
raise ValueError('First element of config must be a tf.DType.')
if not isinstance(config[1], list):
raise ValueError('Second element of config must be a list.')
assert isinstance(inputs, collections.OrderedDict)
for modality_type in inputs:
assert_config(inputs[modality_type])
self._inputs = collections.OrderedDict(
[(k, io_config(*value)) for k, value in inputs.iteritems()])
if query is not None:
assert_config(query)
self._query = io_config(*query)
else:
self._query = None
assert_config(output)
self._output = io_config(*output)
@property
def inputs(self):
return self._inputs
@property
def output(self):
return self._output
@property
def query(self):
return self._query
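# Construction sketch (illustrative): a config with a single image input
# sequence, an image query and a one-hot output; all shapes below are
# assumptions.
#
#   config = UnrolledTaskIOConfig(
#       inputs=collections.OrderedDict(
#           [('image', (tf.float32, [8, 64, 64, 3]))]),
#       query=(tf.float32, [64, 64, 3]),
#       output=(tf.float32, [8, 4]))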
class UnrolledTask(object):
"""An interface for a Task which can be unrolled during training.
  Each example is called an episode and consists of inputs and a target output,
  where the output can be considered as the desired unrolled sequence of
  actions for the inputs. For the specified tasks, these action sequences must
  be unambiguously definable.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, config):
assert isinstance(config, UnrolledTaskIOConfig)
self._config = config
# A dict of bookkeeping variables.
self.info = {}
    # TensorFlow input is multithreaded and this lock is needed to prevent
    # race conditions in the environment. Without the lock, non-thread-safe
    # environments crash.
self._lock = threading.Lock()
@property
def config(self):
return self._config
@abc.abstractmethod
def episode(self):
"""Returns data needed to train and test a single episode.
Each episode consists of inputs, which define the context of the task, a
query which defines the task, and a target output, which defines a
sequence of actions to be executed for this query. This sequence should not
    require feedback, i.e. it can be predicted purely from the input and query.
Returns:
inputs, query, output, where inputs is a list of numpy arrays and query
and output are numpy arrays. These arrays must be of shape and type as
specified in the task configuration.
"""
pass
def reset(self, observation):
"""Called after the environment is reset."""
pass
def episode_batch(self, batch_size):
"""Returns a batch of episodes.
Args:
batch_size: size of batch.
Returns:
(inputs, query, output, masks) where inputs is list of numpy arrays and
query, output, and mask are numpy arrays. These arrays must be of shape
and type as specified in the task configuration with one additional
preceding dimension corresponding to the batch.
Raises:
ValueError: if self.episode() returns illegal values.
"""
batched_inputs = collections.OrderedDict(
[[mtype, []] for mtype in self.config.inputs])
batched_queries = []
batched_outputs = []
batched_masks = []
for _ in range(int(batch_size)):
with self._lock:
        # The episode function needs to be thread-safe. Since the current
        # implementations of the envs are not thread-safe, we need to lock
        # the operations here.
inputs, query, outputs = self.episode()
if not isinstance(outputs, tuple):
raise ValueError('Outputs return value must be tuple.')
if len(outputs) != 2:
raise ValueError('Output tuple must be of size 2.')
if inputs is not None:
for modality_type in batched_inputs:
batched_inputs[modality_type].append(
np.expand_dims(inputs[modality_type], axis=0))
if query is not None:
batched_queries.append(np.expand_dims(query, axis=0))
batched_outputs.append(np.expand_dims(outputs[0], axis=0))
if outputs[1] is not None:
batched_masks.append(np.expand_dims(outputs[1], axis=0))
batched_inputs = {
k: np.concatenate(i, axis=0) for k, i in batched_inputs.iteritems()
}
if batched_queries:
batched_queries = np.concatenate(batched_queries, axis=0)
batched_outputs = np.concatenate(batched_outputs, axis=0)
if batched_masks:
batched_masks = np.concatenate(batched_masks, axis=0).astype(np.float32)
else:
# When the array is empty, the default np.dtype is float64 which causes
# py_func to crash in the tests.
batched_masks = np.array([], dtype=np.float32)
batched_inputs = [batched_inputs[k] for k in self._config.inputs]
return batched_inputs, batched_queries, batched_outputs, batched_masks
def tf_episode_batch(self, batch_size):
"""A batch of episodes as TF Tensors.
Same as episode_batch with the difference that the return values are TF
Tensors.
Args:
batch_size: a python float for the batch size.
Returns:
inputs, query, output, mask where inputs is a dictionary of tf.Tensor
where the keys are the modality types specified in the config.inputs.
query, output, and mask are TF Tensors. These tensors must
be of shape and type as specified in the task configuration with one
      additional preceding dimension corresponding to the batch. The mask has
      the same batch and sequence dimensions as output, but no trailing label
      dimension.
"""
# Define TF outputs.
touts = []
shapes = []
for _, i in self._config.inputs.iteritems():
touts.append(i.type)
shapes.append(i.shape)
if self._config.query is not None:
touts.append(self._config.query.type)
shapes.append(self._config.query.shape)
# Shapes and types for batched_outputs.
touts.append(self._config.output.type)
shapes.append(self._config.output.shape)
# Shapes and types for batched_masks.
touts.append(self._config.output.type)
shapes.append(self._config.output.shape[0:1])
def episode_batch_func():
if self.config.query is None:
inp, _, output, masks = self.episode_batch(int(batch_size))
return tuple(inp) + (output, masks)
else:
inp, query, output, masks = self.episode_batch(int(batch_size))
return tuple(inp) + (query, output, masks)
tf_episode_batch = tf.py_func(episode_batch_func, [], touts,
stateful=True, name='taskdata')
for episode, shape in zip(tf_episode_batch, shapes):
episode.set_shape([batch_size] + shape)
tf_episode_batch_dict = collections.OrderedDict([
(mtype, episode)
for mtype, episode in zip(self.config.inputs.keys(), tf_episode_batch)
])
cur_index = len(self.config.inputs.keys())
tf_query = None
if self.config.query is not None:
tf_query = tf_episode_batch[cur_index]
cur_index += 1
tf_outputs = tf_episode_batch[cur_index]
tf_masks = tf_episode_batch[cur_index + 1]
return tf_episode_batch_dict, tf_query, tf_outputs, tf_masks
@abc.abstractmethod
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of shape and type as defined in the task config
containing the true outputs.
targets: tf.Tensor of shape and type as defined in the task config
containing the predicted outputs.
      weights: a bool tf.Tensor of the same shape as targets. Only true values
        are
considered when formulating the loss.
"""
pass
def reward(self, obs, done, info):
"""Returns a reward.
    The task has to compute a reward based on the state of the environment. The
    reward computation, though, is task specific. The task should use the
    environment interface, as defined in task_env.py, to compute the reward. If
this interface does not expose enough information, it is to be updated.
Args:
obs: Observation from environment's step function.
done: Done flag from environment's step function.
info: Info dict from environment's step function.
Returns:
obs: Observation.
reward: Floating point value.
done: Done flag.
info: Info dict.
"""
# Default implementation does not do anything.
return obs, 0.0, done, info
class RandomExplorationBasedTask(UnrolledTask):
"""A Task which starts with a random exploration of the environment."""
def __init__(self,
env,
seed,
add_query_noise=False,
query_noise_var=0.0,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
"""Initializes a Task using a random exploration runs.
Args:
env: an instance of type TaskEnv and gym.Env.
seed: a random seed.
add_query_noise: boolean, if True then whatever queries are generated,
they are randomly perturbed. The semantics of the queries depends on the
concrete task implementation.
query_noise_var: float, the variance of Gaussian noise used for query
perturbation. Used iff add_query_noise==True.
*args: see super class.
**kwargs: see super class.
"""
super(RandomExplorationBasedTask, self).__init__(*args, **kwargs)
assert isinstance(env, task_env.TaskEnv)
self._env = env
self._env.set_task(self)
self._rng = np.random.RandomState(seed)
self._add_query_noise = add_query_noise
self._query_noise_var = query_noise_var
# GoToStaticXTask can also take empty config but for the rest of the classes
# the number of modality types is 1.
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
def _exploration(self):
"""Generates a random exploration run.
The function uses the environment to generate a run.
Returns:
      A dict of numpy arrays keyed by modality type. Each array contains the
      observations of that modality, with type and shape as specified in
      config.inputs.
A list of states along the exploration path.
A list of vertex indices corresponding to the path of the exploration.
"""
in_seq_len = self._config.inputs.values()[0].shape[0]
path, _, states, step_outputs = self._env.random_step_sequence(
min_len=in_seq_len)
obs = {modality_type: [] for modality_type in self._config.inputs}
for o in step_outputs:
step_obs, _, done, _ = o
# It is expected that each value of step_obs is a dict of observations,
# whose dimensions are consistent with the config.inputs sizes.
for modality_type in self._config.inputs:
assert modality_type in step_obs, '{}'.format(type(step_obs))
o = step_obs[modality_type]
i = self._config.inputs[modality_type]
assert len(o.shape) == len(i.shape) - 1
for dim_o, dim_i in zip(o.shape, i.shape[1:]):
assert dim_o == dim_i, '{} != {}'.format(dim_o, dim_i)
obs[modality_type].append(o)
if done:
break
if not obs:
return obs, states, path
max_path_len = int(
round(in_seq_len * float(len(path)) / float(len(obs.values()[0]))))
path = path[-max_path_len:]
states = states[-in_seq_len:]
    # The above obs is a dict of lists of np.arrays. Re-format it as a dict of
    # np.arrays, each array containing all observations from all steps.
def regroup(obs, i):
"""Regroups observations.
Args:
        obs: a dict mapping each modality type to a list of observations, where
          the k-th element of each list is the observation from the k-th step.
          Each observation is a numpy array.
        i: the modality type whose observations are to be grouped.
      Returns:
        A numpy array of shape config.inputs[i].shape which contains all
        observations of modality i from all steps, concatenated along the first
        dimension. In addition, if the number of observations differs from
        the one specified in config.inputs[i].shape[0], the array is either
        padded at the back or clipped from the front.
"""
grouped_obs = np.concatenate(
[np.expand_dims(o, axis=0) for o in obs[i]], axis=0)
in_seq_len = self._config.inputs[i].shape[0]
# pylint: disable=unbalanced-tuple-unpacking
grouped_obs, _ = _pad_or_clip_array(
grouped_obs, in_seq_len, is_front_clip=True)
return grouped_obs
all_obs = {i: regroup(obs, i) for i in self._config.inputs}
return all_obs, states, path
def _obs_to_state(self, path, states):
"""Computes mapping between path nodes and states."""
# Generate a numpy array of locations corresponding to the path vertices.
path_coordinates = map(self._env.vertex_to_pose, path)
path_coordinates = np.concatenate(
[np.reshape(p, [1, 2]) for p in path_coordinates])
    # The observations are taken along a smoothed trajectory following the path.
    # We compute a mapping between the observations and the map vertices.
path_to_obs = collections.defaultdict(list)
obs_to_state = []
for i, s in enumerate(states):
location = np.reshape(s[0:2], [1, 2])
index = np.argmin(
np.reshape(
np.sum(np.power(path_coordinates - location, 2), axis=1), [-1]))
index = path[index]
path_to_obs[index].append(i)
obs_to_state.append(index)
return path_to_obs, obs_to_state
def _perturb_state(self, state, noise_var):
"""Perturbes the state.
The location are purturbed using a Gaussian noise with variance
noise_var. The orientation is uniformly sampled.
Args:
state: a numpy array containing an env state (x, y locations).
noise_var: float
Returns:
The perturbed state.
"""
def normal(v, std):
if std > 0:
n = self._rng.normal(0.0, std)
n = min(n, 2.0 * std)
n = max(n, -2.0 * std)
return v + n
else:
return v
state = state.copy()
state[0] = normal(state[0], noise_var)
state[1] = normal(state[1], noise_var)
if state.size > 2:
state[2] = self._rng.uniform(-math.pi, math.pi)
return state
def _sample_obs(self,
indices,
observations,
observation_states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True):
"""Samples one observation which corresponds to vertex_index in path.
In addition, the sampled observation must have index in observations less
than max_obs_index. If these two conditions cannot be satisfied the
function returns None.
Args:
indices: a list of integers.
observations: a list of numpy arrays containing all the observations.
observation_states: a list of numpy arrays, each array representing the
state of the observation.
path_to_obs: a dict of path indices to lists of observation indices.
max_obs_index: an integer.
use_exploration_obs: if True, then the observation is sampled among the
specified observations, otherwise it is obtained from the environment.
Returns:
A tuple of:
-- A numpy array of size width x height x 3 representing the sampled
observation.
      -- The index of the sampled observation among the input observations.
-- The state at which the observation is captured.
Raises:
ValueError: if the observation and observation_states lists are of
different lengths.
"""
if len(observations) != len(observation_states):
raise ValueError('observation and observation_states lists must have '
'equal lengths')
if not indices:
return None, None, None
vertex_index = self._rng.choice(indices)
if use_exploration_obs:
obs_indices = path_to_obs[vertex_index]
if max_obs_index is not None:
obs_indices = [i for i in obs_indices if i < max_obs_index]
if obs_indices:
index = self._rng.choice(obs_indices)
if self._add_query_noise:
xytheta = self._perturb_state(observation_states[index],
self._query_noise_var)
return self._env.observation(xytheta), index, xytheta
else:
return observations[index], index, observation_states[index]
else:
return None, None, None
else:
xy = self._env.vertex_to_pose(vertex_index)
xytheta = np.array([xy[0], xy[1], 0.0])
xytheta = self._perturb_state(xytheta, self._query_noise_var)
return self._env.observation(xytheta), None, xytheta
class AreNearbyTask(RandomExplorationBasedTask):
"""A task of identifying whether a query is nearby current location or not.
The query is guaranteed to be in proximity of an already visited location,
i.e. close to one of the observations. For each observation we have one
query, which is either close or not to this observation.
"""
def __init__(
self,
max_distance=0,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
super(AreNearbyTask, self).__init__(*args, **kwargs)
self._max_distance = max_distance
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
"""Episode data.
Returns:
observations: a tuple with one element. This element is a numpy array of
size in_seq_len x observation_size x observation_size x 3 containing
in_seq_len images.
query: a numpy array of size
        in_seq_len x observation_size x observation_size x 3 containing query
        images, one per observation.
      A tuple of size two. The first element is an in_seq_len x 3 numpy array of
        one-hot label vectors. The i-th row denotes whether the i-th query image
        is visited and nearby (label 1), visited but not nearby (label 0), or
        not visited (label 2) relative to the i-th observation.
        The second element in the tuple is a mask, a numpy array of size
        in_seq_len x 1 with values 1.0 or 0.0 denoting whether the query is
        valid or not (it can happen that the query is not valid, e.g. there are
        not enough observations to form a meaningful query).
"""
observations, states, path = self._exploration()
assert len(observations.values()[0]) == len(states)
    # The observations are taken along a smoothed trajectory following the path.
    # We compute a mapping between the observations and the map vertices.
path_to_obs, obs_to_path = self._obs_to_state(path, states)
# Go over all observations, and sample a query. With probability 0.5 this
# query is a nearby observation (defined as belonging to the same vertex
# in path).
g = self._env.graph
queries = []
labels = []
validity_masks = []
query_index_in_observations = []
for i, curr_o in enumerate(observations.values()[0]):
p = obs_to_path[i]
low = max(0, i - self._max_distance)
# A list of lists of vertex indices. Each list in this group corresponds
# to one possible label.
index_groups = [[], [], []]
# Nearby visited indices, label 1.
nearby_visited = [
ii for ii in path[low:i + 1] + g[p].keys() if ii in obs_to_path[:i]
]
      nearby_visited = [ii for ii in nearby_visited if ii in path_to_obs]
# NOT Nearby visited indices, label 0.
not_nearby_visited = [ii for ii in path[:low] if ii not in g[p].keys()]
      not_nearby_visited = [ii for ii in not_nearby_visited if ii in path_to_obs]
# NOT visited indices, label 2.
not_visited = [
ii for ii in range(g.number_of_nodes()) if ii not in path[:i + 1]
]
index_groups = [not_nearby_visited, nearby_visited, not_visited]
# Consider only labels for which there are indices.
allowed_labels = [ii for ii, group in enumerate(index_groups) if group]
label = self._rng.choice(allowed_labels)
indices = list(set(index_groups[label]))
max_obs_index = None if label == 2 else i
use_exploration_obs = False if label == 2 else True
o, obs_index, _ = self._sample_obs(
indices=indices,
observations=observations.values()[0],
observation_states=states,
path_to_obs=path_to_obs,
max_obs_index=max_obs_index,
use_exploration_obs=use_exploration_obs)
query_index_in_observations.append(obs_index)
# If we cannot sample a valid query, we mark it as not valid in mask.
if o is None:
label = 0.0
o = curr_o
validity_masks.append(0)
else:
validity_masks.append(1)
queries.append(o.values()[0])
labels.append(label)
query = np.concatenate([np.expand_dims(q, axis=0) for q in queries], axis=0)
def one_hot(label, num_labels=3):
a = np.zeros((num_labels,), dtype=np.float)
a[int(label)] = 1.0
return a
outputs = np.stack([one_hot(l) for l in labels], axis=0)
validity_mask = np.reshape(
np.array(validity_masks, dtype=np.int32), [-1, 1])
self.info['query_index_in_observations'] = query_index_in_observations
self.info['observation_states'] = states
return observations, query, (outputs, validity_mask)
def target_loss(self, truth, predicted, weights=None):
pass
class NeighboringQueriesTask(RandomExplorationBasedTask):
"""A task of identifying whether two queries are closeby or not.
The proximity between queries is defined by the length of the shorest path
between them.
"""
def __init__(
self,
max_distance=1,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
"""Initializes a NeighboringQueriesTask.
Args:
max_distance: integer, the maximum distance in terms of number of vertices
between the two queries, so that they are considered neighboring.
*args: for super class.
**kwargs: for super class.
"""
super(NeighboringQueriesTask, self).__init__(*args, **kwargs)
self._max_distance = max_distance
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
"""Episode data.
Returns:
observations: a tuple with one element. This element is a numpy array of
size in_seq_len x observation_size x observation_size x 3 containing
in_seq_len images.
query: a numpy array of size
        2 x observation_size x observation_size x 3 containing a pair of query
        images.
      A tuple of size two. The first element is a numpy array of size 2
        containing a one-hot vector of whether the two observations are
        neighboring. The second element is a boolean numpy value denoting
        whether this is a valid
episode.
"""
observations, states, path = self._exploration()
assert len(observations.values()[0]) == len(states)
path_to_obs, _ = self._obs_to_state(path, states)
# Restrict path to ones for which observations have been generated.
path = [p for p in path if p in path_to_obs]
# Sample first query.
query1_index = self._rng.choice(path)
# Sample label.
label = self._rng.randint(2)
# Sample second query.
# If label == 1, then second query must be nearby, otherwise not.
closest_indices = nx.single_source_shortest_path(
self._env.graph, query1_index, self._max_distance).keys()
    if label == 0:
      # Path indices which are not close to the first query.
      indices = [p for p in path if p not in closest_indices]
    else:
      # Close indices which are on the path.
      indices = [p for p in closest_indices if p in path]
query2_index = self._rng.choice(indices)
# Generate an observation.
query1, query1_index, _ = self._sample_obs(
[query1_index],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True)
query2, query2_index, _ = self._sample_obs(
[query2_index],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True)
queries = np.concatenate(
[np.expand_dims(q, axis=0) for q in [query1, query2]])
labels = np.array([0, 0])
labels[label] = 1
is_valid = np.array([1])
self.info['observation_states'] = states
self.info['query_indices_in_observations'] = [query1_index, query2_index]
return observations, queries, (labels, is_valid)
def target_loss(self, truth, predicted, weights=None):
pass
#@gin.configurable
class GotoStaticXTask(RandomExplorationBasedTask):
"""Task go to a static X.
If continuous reward is used only one goal is allowed so that the reward can
be computed as a delta-distance to that goal..
"""
def __init__(self,
step_reward=0.0,
goal_reward=1.0,
hit_wall_reward=-1.0,
done_at_target=False,
use_continuous_reward=False,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
super(GotoStaticXTask, self).__init__(*args, **kwargs)
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
self._step_reward = step_reward
self._goal_reward = goal_reward
self._hit_wall_reward = hit_wall_reward
self._done_at_target = done_at_target
self._use_continuous_reward = use_continuous_reward
self._previous_path_length = None
def episode(self):
observations, _, path = self._exploration()
if len(path) < 2:
raise ValueError('The exploration path has only one node.')
g = self._env.graph
start = path[-1]
while True:
goal = self._rng.choice(path[:-1])
if goal != start:
break
goal_path = nx.shortest_path(g, start, goal)
init_orientation = self._rng.uniform(0, np.pi, (1,))
trajectory = np.array(
[list(self._env.vertex_to_pose(p)) for p in goal_path])
init_xy = np.reshape(trajectory[0, :], [-1])
init_state = np.concatenate([init_xy, init_orientation], 0)
trajectory = trajectory[1:, :]
deltas = envs_util.trajectory_to_deltas(trajectory, init_state)
output_seq_len = self._config.output.shape[0]
arr = _pad_or_clip_array(deltas, output_seq_len, output_mask=True)
# pylint: disable=unbalanced-tuple-unpacking
thetas, _, thetas_mask = arr
query = self._env.observation(self._env.vertex_to_pose(goal)).values()[0]
return observations, query, (thetas, thetas_mask)
def reward(self, obs, done, info):
if 'wall_collision' in info and info['wall_collision']:
return obs, self._hit_wall_reward, done, info
reward = 0.0
current_vertex = self._env.pose_to_vertex(self._env.state)
if current_vertex in self._env.targets():
if self._done_at_target:
done = True
else:
obs = self._env.reset()
reward = self._goal_reward
else:
if self._use_continuous_reward:
if len(self._env.targets()) != 1:
raise ValueError(
'FindX task with continuous reward is assuming only one target.')
goal_vertex = self._env.targets()[0]
path_length = self._compute_path_length(goal_vertex)
reward = self._previous_path_length - path_length
self._previous_path_length = path_length
else:
reward = self._step_reward
return obs, reward, done, info
def _compute_path_length(self, goal_vertex):
current_vertex = self._env.pose_to_vertex(self._env.state)
path = nx.shortest_path(self._env.graph, current_vertex, goal_vertex)
assert len(path) >= 2
curr_xy = np.array(self._env.state[:2])
next_xy = np.array(self._env.vertex_to_pose(path[1]))
last_step_distance = np.linalg.norm(next_xy - curr_xy)
return (len(path) - 2) * self._env.cell_size_px + last_step_distance
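  # Worked example (illustrative): for a shortest path of 4 vertices and a
  # cell size of 10 px, the returned length is (4 - 2) * 10 plus the Euclidean
  # distance from the current pose to the next vertex on the path.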
def reset(self, observation):
if self._use_continuous_reward:
if len(self._env.targets()) != 1:
raise ValueError(
'FindX task with continuous reward is assuming only one target.')
goal_vertex = self._env.targets()[0]
self._previous_path_length = self._compute_path_length(goal_vertex)
def target_loss(self, truth, predicted, weights=None):
"""Action classification loss.
Args:
truth: a batch_size x sequence length x number of labels float
Tensor containing a one hot vector for each label in each batch and
time.
predicted: a batch_size x sequence length x number of labels float
Tensor containing a predicted distribution over all actions.
weights: a batch_size x sequence_length float Tensor of bool
denoting which actions are valid.
Returns:
An average cross entropy over all batches and elements in sequence.
"""
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class RelativeLocationTask(RandomExplorationBasedTask):
"""A task of estimating the relative location of a query w.r.t current.
It is to be used for debugging. It is designed such that the output is a
single value, out of a discrete set of values, so that it can be phrased as
a classification problem.
"""
def __init__(self, num_labels, *args, **kwargs):
"""Initializes a relative location task.
Args:
num_labels: integer, number of orientations to bin the relative
orientation into.
*args: see super class.
**kwargs: see super class.
"""
super(RelativeLocationTask, self).__init__(*args, **kwargs)
self._num_labels = num_labels
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
observations, states, path = self._exploration()
# Select a random element from history.
path_to_obs, _ = self._obs_to_state(path, states)
use_exploration_obs = not self._add_query_noise
query, _, query_state = self._sample_obs(
path[:-1],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=use_exploration_obs)
x, y, theta = tuple(states[-1])
q_x, q_y, _ = tuple(query_state)
t_x, t_y = q_x - x, q_y - y
(rt_x, rt_y) = (np.sin(theta) * t_x - np.cos(theta) * t_y,
np.cos(theta) * t_x + np.sin(theta) * t_y)
# Bins are [a(i), a(i+1)] for a(i) = -pi + 0.5 * bin_size + i * bin_size.
shift = np.pi * (1 - 1.0 / (2.0 * self._num_labels))
orientation = np.arctan2(rt_y, rt_x) + shift
if orientation < 0:
orientation += 2 * np.pi
label = int(np.floor(self._num_labels * orientation / (2 * np.pi)))
out_shape = self._config.output.shape
if len(out_shape) != 1:
raise ValueError('Output shape should be of rank 1.')
if out_shape[0] != self._num_labels:
raise ValueError('Output shape must be of size %d' % self._num_labels)
output = np.zeros(out_shape, dtype=np.float32)
output[label] = 1
return observations, query, (output, None)
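  # Worked example (illustrative): with num_labels = 4, shift is
  # pi * (1 - 1/8) = 7*pi/8, so a relative direction of arctan2(rt_y, rt_x) = 0
  # maps to orientation 7*pi/8 and to label
  # floor(4 * (7*pi/8) / (2*pi)) = floor(1.75) = 1.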
def target_loss(self, truth, predicted, weights=None):
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class LocationClassificationTask(UnrolledTask):
"""A task of classifying a location as one of several classes.
The task does not have an input, but just a query and an output. The query
is an observation of the current location, e.g. an image taken from the
current state. The output is a label classifying this location in one of
predefined set of locations (or landmarks).
  The current implementation classifies locations as intersections based on the
  number and directions of bifurcations. It is expected that a location can have
  at most 4 different directions, aligned with the axes. As each of these four
  directions might be present or not, the number of possible intersection types
  is 2^4 = 16.
"""
def __init__(self, env, seed, *args, **kwargs):
super(LocationClassificationTask, self).__init__(*args, **kwargs)
self._env = env
self._rng = np.random.RandomState(seed)
# A location property which can be set. If not set, a random one is
# generated.
self._location = None
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
@property
def location(self):
return self._location
@location.setter
def location(self, location):
self._location = location
def episode(self):
    # Get a location. If not set, sample one at a vertex with a random
    # orientation.
location = self._location
if location is None:
num_nodes = self._env.graph.number_of_nodes()
vertex = int(math.floor(self._rng.uniform(0, num_nodes)))
xy = self._env.vertex_to_pose(vertex)
theta = self._rng.uniform(0, 2 * math.pi)
location = np.concatenate(
[np.reshape(xy, [-1]), np.array([theta])], axis=0)
else:
vertex = self._env.pose_to_vertex(location)
theta = location[2]
neighbors = self._env.graph.neighbors(vertex)
xy_s = [self._env.vertex_to_pose(n) for n in neighbors]
def rotate(xy, theta):
"""Rotates a vector around the origin by angle theta.
Args:
        xy: a numpy ndarray of shape (2,) of floats containing the x and y
          coordinates of a vector.
        theta: a python float containing the rotation angle in radians.
      Returns:
        A numpy ndarray of floats of shape (2,) containing the x and y
        coordinates of the rotated xy.
"""
rotated_x = np.cos(theta) * xy[0] - np.sin(theta) * xy[1]
rotated_y = np.sin(theta) * xy[0] + np.cos(theta) * xy[1]
return np.array([rotated_x, rotated_y])
    # Rotate all intersection bifurcations by the orientation of the agent, as
    # the intersection label is defined in an agent-centered fashion.
xy_s = [
rotate(xy - location[0:2], -location[2] - math.pi / 4) for xy in xy_s
]
th_s = [np.arctan2(xy[1], xy[0]) for xy in xy_s]
out_shape = self._config.output.shape
if len(out_shape) != 1:
raise ValueError('Output shape should be of rank 1.')
num_labels = out_shape[0]
if num_labels != 16:
raise ValueError('Currently only 16 labels are supported '
'(there are 16 different 4 way intersection types).')
th_s = set([int(math.floor(4 * (th / (2 * np.pi) + 0.5))) for th in th_s])
one_hot_label = np.zeros((num_labels,), dtype=np.float32)
label = 0
for th in th_s:
label += pow(2, th)
one_hot_label[int(label)] = 1.0
query = self._env.observation(location).values()[0]
return [], query, (one_hot_label, None)
def reward(self, obs, done, info):
raise ValueError('Do not call.')
def target_loss(self, truth, predicted, weights=None):
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class GotoStaticXNoExplorationTask(UnrolledTask):
"""An interface for findX tasks without exploration.
  The agent is initialized at a random location in a random world with a random
  goal, and the objective is for the agent to move toward the goal. This class
  generates episodes for such a task. Each episode consists of a sequence of
  observations x and target outputs y. x is the observations, an OrderedDict
  with keys provided from config.inputs.keys() and the shapes provided in
  config.inputs. The output is a numpy array with the shape specified in
  config.output. The shape of the array is (sequence_length x action_size),
  where action_size is the number of actions that can be taken in the
  environment. Note that config.output.shape should be set according to the
  number of actions that can be taken in the env.
  The target outputs y are the groundtruth values of each action, computed
  from the environment graph. The target output for each action is proportional
  to the progress that the action makes. A target value of 1 means that the
  action takes the agent one step closer to the goal, -1 means the action takes
  the agent one step farther. A value of -2 means that the action should not be
  taken at all, either because it leads to a collision or because it would
  terminate the episode prematurely.
"""
def __init__(self, env, *args, **kwargs):
super(GotoStaticXNoExplorationTask, self).__init__(*args, **kwargs)
if self._config.query is not None:
raise ValueError('query should be None.')
if len(self._config.output.shape) != 2:
raise ValueError('output should only have two dimensions:'
'(sequence_length x number_of_actions)')
for input_config in self._config.inputs.values():
if input_config.shape[0] != self._config.output.shape[0]:
        raise ValueError('the first dimension of the input and output should '
                         'be the same.')
if len(self._config.output.shape) != 2:
raise ValueError('output shape should be '
'(sequence_length x number_of_actions)')
self._env = env
def _compute_shortest_path_length(self, vertex, target_vertices):
"""Computes length of the shortest path from vertex to any target vertexes.
Args:
vertex: integer, index of the vertex in the environment graph.
target_vertices: list of the target vertexes
Returns:
integer, minimum distance from the vertex to any of the target_vertices.
Raises:
ValueError: if there is no path between the vertex and at least one of
the target_vertices.
"""
try:
return np.min([
len(nx.shortest_path(self._env.graph, vertex, t))
for t in target_vertices
])
except:
#logging.error('there is no path between vertex %d and at least one of '
# 'the targets %r', vertex, target_vertices)
raise
def _compute_gt_value(self, vertex, target_vertices):
"""Computes groundtruth value of all the actions at the vertex.
The value of each action is the difference each action makes in the length
of the shortest path to the goal. If an action takes the agent one step
closer to the goal the value is 1. In case, it takes the agent one step away
from the goal it would be -1. If it leads to collision or if the agent uses
action stop before reaching to the goal it is -2. To avoid scale issues the
gt_values are multipled by 0.5.
Args:
vertex: integer, the index of current vertex.
target_vertices: list of the integer indexes of the target views.
Returns:
numpy array with shape (action_size,) and each element is the groundtruth
value of each action based on the progress each action makes.
"""
action_size = self._config.output.shape[1]
output_value = np.ones((action_size), dtype=np.float32) * -2
my_distance = self._compute_shortest_path_length(vertex, target_vertices)
for adj in self._env.graph[vertex]:
adj_distance = self._compute_shortest_path_length(adj, target_vertices)
if adj_distance is None:
continue
action_index = self._env.action(
self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
assert action_index is not None, ('{} is not adjacent to {}. There might '
'be a problem in environment graph '
'connectivity because there is no '
'direct edge between the given '
'vertices').format(
self._env.vertex_to_pose(vertex),
self._env.vertex_to_pose(adj))
output_value[action_index] = my_distance - adj_distance
return output_value * 0.5
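# Illustrative walk-through of the computation above (hypothetical numbers): if
# the current vertex is 3 shortest-path steps from the closest goal, a neighbor
# at distance 2 yields my_distance - adj_distance = 1, a neighbor at distance 4
# yields -1, and any action without a corresponding adjacent vertex keeps the
# initialized value of -2. After the final * 0.5 scaling these become
# 0.5, -0.5 and -1.0 respectively.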
def episode(self):
"""Returns data needed to train and test a single episode.
Returns:
(inputs, None, output) where inputs is a dictionary of modality types to
numpy arrays. The second element is the query; since the goal is assumed
to be given as part of the observation, it is None for this task. The
output is a tuple of the ground truth action values with the shape of
(sequence_length x action_size), coming from config.output.shape, and a
numpy array with the shape of (sequence_length,) that is 1 if the
corresponding element of the input and output should be used in the
training optimization.
Raises:
ValueError: If the output values from env.random_step_sequence are not
valid.
ValueError: If the shape of observations coming from the env is not
consistent with the config.
ValueError: If there is a modality type specified in the config but the
environment does not return that.
"""
# Sequence length is the first dimension of any of the input tensors.
sequence_length = self._config.inputs.values()[0].shape[0]
modality_types = self._config.inputs.keys()
path, _, _, step_outputs = self._env.random_step_sequence(
max_len=sequence_length)
target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
if len(path) != len(step_outputs):
raise ValueError('path and step_outputs should have equal length'
' {}!={}'.format(len(path), len(step_outputs)))
# Building up observations. observations will be an OrderedDict of
# modality types. The values are numpy arrays that follow the given shape
# in the input config for each modality type.
observations = collections.OrderedDict([k, []] for k in modality_types)
for step_output in step_outputs:
obs_dict = step_output[0]
# Only going over the modality types that are specified in the input
# config.
for modality_type in modality_types:
if modality_type not in obs_dict:
raise ValueError('modality type is not returned from the environment. '
'{} not in {}'.format(modality_type,
obs_dict.keys()))
obs = obs_dict[modality_type]
if np.any(
obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
raise ValueError(
'The observations should have the same size as specified in '
'config for modality type {}. {} != {}'.format(
modality_type, obs.shape,
self._config.inputs[modality_type].shape[1:]))
observations[modality_type].append(obs)
gt_value = [self._compute_gt_value(v, target_vertices) for v in path]
# pylint: disable=unbalanced-tuple-unpacking
gt_value, _, value_mask = _pad_or_clip_array(
np.array(gt_value),
sequence_length,
is_front_clip=False,
output_mask=True,
)
for modality_type, obs in observations.iteritems():
observations[modality_type], _, mask = _pad_or_clip_array(
np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
assert np.all(mask == value_mask)
return observations, None, (gt_value, value_mask)
def reset(self, observation):
"""Called after the environment is reset."""
pass
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
weights: tf.Tensor of tf.bool with the shape of
(batch_size x sequence_length).
Raises:
ValueError: if the shapes of the input tensors are not consistent.
Returns:
L2 loss between the predicted action values and true action values.
"""
targets_shape = targets.get_shape().as_list()
true_targets_shape = true_targets.get_shape().as_list()
if len(targets_shape) != 3 or len(true_targets_shape) != 3:
raise ValueError('invalid shape for targets or true_targets_shape')
if np.any(targets_shape != true_targets_shape):
raise ValueError('the shapes of targets and true_targets are not the same '
'{} != {}'.format(targets_shape, true_targets_shape))
if weights is not None:
# Filtering targets and true_targets using weights.
weights_shape = weights.get_shape().as_list()
if np.any(weights_shape != targets_shape[0:2]):
raise ValueError('The first two elements of weights shape should match '
'target. {} != {}'.format(weights_shape,
targets_shape))
true_targets = tf.boolean_mask(true_targets, weights)
targets = tf.boolean_mask(targets, weights)
return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
tf.reshape(true_targets, [-1]))
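# A minimal sketch of how the masking above behaves, assuming a toy setup (the
# shapes and values are hypothetical): with batch_size=1, sequence_length=3 and
# action_size=2, a weights tensor of [[True, True, False]] selects only the
# first two timesteps, so tf.boolean_mask yields tensors of shape (2, 2) and
# the mean squared error is averaged over those 4 remaining values only; padded
# timesteps contribute nothing to the loss.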
def reward(self, obs, done, info):
raise NotImplementedError('reward is not implemented for this task')
################################################################################
class NewTask(UnrolledTask):
def __init__(self, env, *args, **kwargs):
super(NewTask, self).__init__(*args, **kwargs)
self._env = env
def _compute_shortest_path_length(self, vertex, target_vertices):
"""Computes length of the shortest path from vertex to any target vertexes.
Args:
vertex: integer, index of the vertex in the environment graph.
target_vertices: list of the target vertices.
Returns:
integer, minimum distance from the vertex to any of the target_vertices.
Raises:
ValueError: if there is no path between the vertex and at least one of
the target_vertices.
"""
try:
return np.min([
len(nx.shortest_path(self._env.graph, vertex, t))
for t in target_vertices
])
except:
logging.error('there is no path between vertex %d and at least one of '
'the targets %r', vertex, target_vertices)
raise
def _compute_gt_value(self, vertex, target_vertices):
"""Computes groundtruth value of all the actions at the vertex.
The value of each action is the difference each action makes in the length
of the shortest path to the goal. If an action takes the agent one step
closer to the goal, the value is 1. If it takes the agent one step away
from the goal, the value is -1. If it leads to a collision, or if the agent
uses the stop action before reaching the goal, the value is -2. To avoid
scale issues the gt_values are multiplied by 0.5.
Args:
vertex: integer, the index of current vertex.
target_vertices: list of the integer indexes of the target views.
Returns:
numpy array with shape (action_size,) and each element is the groundtruth
value of each action based on the progress each action makes.
"""
action_size = self._config.output.shape[1]
output_value = np.ones((action_size), dtype=np.float32) * -2
# Uses this class's own _compute_shortest_path_length, which returns a float.
my_distance = self._compute_shortest_path_length(vertex, target_vertices)
for adj in self._env.graph[vertex]:
adj_distance = self._compute_shortest_path_length(adj, target_vertices)
if adj_distance is None:
continue
action_index = self._env.action(
self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
assert action_index is not None, ('{} is not adjacent to {}. There might '
'be a problem in environment graph '
'connectivity because there is no '
'direct edge between the given '
'vertices').format(
self._env.vertex_to_pose(vertex),
self._env.vertex_to_pose(adj))
output_value[action_index] = my_distance - adj_distance
return output_value * 0.5
def episode(self):
"""Returns data needed to train and test a single episode.
Returns:
(inputs, None, output) where inputs is a dictionary of modality types to
numpy arrays. The second element is the query; since the goal is assumed
to be given as part of the observation, it is None for this task. The
output is a tuple of the ground truth action values with the shape of
(sequence_length x action_size), coming from config.output.shape, and a
numpy array with the shape of (sequence_length,) that is 1 if the
corresponding element of the input and output should be used in the
training optimization.
Raises:
ValueError: If the output values from env.random_step_sequence are not
valid.
ValueError: If the shape of observations coming from the env is not
consistent with the config.
ValueError: If there is a modality type specified in the config but the
environment does not return that.
"""
# Sequence length is the first dimension of any of the input tensors.
sequence_length = self._config.inputs.values()[0].shape[0]
modality_types = self._config.inputs.keys()
path, _, _, step_outputs = self._env.random_step_sequence(
max_len=sequence_length)
target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
if len(path) != len(step_outputs):
raise ValueError('path and step_outputs should have equal length'
' {}!={}'.format(len(path), len(step_outputs)))
# Building up observations. observations will be an OrderedDict of
# modality types. The values are numpy arrays that follow the given shape
# in the input config for each modality type.
observations = collections.OrderedDict([k, []] for k in modality_types)
for step_output in step_outputs:
obs_dict = step_output[0]
# Only going over the modality types that are specified in the input
# config.
for modality_type in modality_types:
if modality_type not in obs_dict:
raise ValueError('modality type is not returned from the environment. '
'{} not in {}'.format(modality_type,
obs_dict.keys()))
obs = obs_dict[modality_type]
if np.any(
obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
raise ValueError(
'The observations should have the same size as specified in '
'config for modality type {}. {} != {}'.format(
modality_type, obs.shape,
self._config.inputs[modality_type].shape[1:]))
observations[modality_type].append(obs)
gt_value = [self._compute_gt_value(v, target_vertices) for v in path]
# pylint: disable=unbalanced-tuple-unpacking
gt_value, _, value_mask = _pad_or_clip_array(
np.array(gt_value),
sequence_length,
is_front_clip=False,
output_mask=True,
)
for modality_type, obs in observations.iteritems():
observations[modality_type], _, mask = _pad_or_clip_array(
np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
assert np.all(mask == value_mask)
return observations, None, (gt_value, value_mask)
def reset(self, observation):
"""Called after the environment is reset."""
pass
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
weights: tf.Tensor of tf.bool with the shape of
(batch_size x sequence_length).
Raises:
ValueError: if the shapes of the input tensors are not consistent.
Returns:
L2 loss between the predicted action values and true action values.
"""
targets_shape = targets.get_shape().as_list()
true_targets_shape = true_targets.get_shape().as_list()
if len(targets_shape) != 3 or len(true_targets_shape) != 3:
raise ValueError('invalid shape for targets or true_targets_shape')
if np.any(targets_shape != true_targets_shape):
raise ValueError('the shapes of targets and true_targets are not the same '
'{} != {}'.format(targets_shape, true_targets_shape))
if weights is not None:
# Filtering targets and true_targets using weights.
weights_shape = weights.get_shape().as_list()
if np.any(weights_shape != targets_shape[0:2]):
raise ValueError('The first two elements of weights shape should match '
'target. {} != {}'.format(weights_shape,
targets_shape))
true_targets = tf.boolean_mask(true_targets, weights)
targets = tf.boolean_mask(targets, weights)
return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
tf.reshape(true_targets, [-1]))
def reward(self, obs, done, info):
raise NotImplementedError('reward is not implemented for this task')
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=line-too-long
# pyformat: disable
"""Train and eval for supervised navigation training.
For training:
python train_supervised_active_vision.py \
--mode='train' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
For testing:
python train_supervised_active_vision.py \
--mode='eval' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
"""
import collections
import os
import time
from absl import app
from absl import flags
from absl import logging
import networkx as nx
import numpy as np
import tensorflow as tf
import gin
import embedders
import policies
import tasks
from envs import active_vision_dataset_env
from envs import task_env
slim = tf.contrib.slim
flags.DEFINE_string('logdir', '',
'Path to a directory to write summaries and checkpoints')
# Parameters controlling the training setup. In general one would not need to
# modify them.
flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master, or local.')
flags.DEFINE_integer('task_id', 0,
'Task id of the replica running the training.')
flags.DEFINE_integer('ps_tasks', 0,
'Number of tasks in the ps job. If 0 no ps job is used.')
flags.DEFINE_integer('decay_steps', 1000,
'Number of steps for exponential decay.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_integer('batch_size', 8, 'Batch size.')
flags.DEFINE_integer('sequence_length', 20, 'sequence length')
flags.DEFINE_integer('train_iters', 200000, 'number of training iterations.')
flags.DEFINE_integer('save_summaries_secs', 300,
'number of seconds between saving summaries')
flags.DEFINE_integer('save_interval_secs', 300,
'number of seconds between saving variables')
flags.DEFINE_integer('log_every_n_steps', 20, 'number of steps between logging')
flags.DEFINE_string('modality_types', '',
'modality names in _ separated format')
flags.DEFINE_string('conv_window_sizes', '8_4_3',
'conv window sizes separated by _')
flags.DEFINE_string('conv_strides', '4_2_1', '')
flags.DEFINE_string('conv_channels', '8_16_16', '')
flags.DEFINE_integer('embedding_fc_size', 128,
'size of embedding for each modality')
flags.DEFINE_integer('obs_resolution', 64,
'resolution of the input observations')
flags.DEFINE_integer('lstm_cell_size', 2048, 'size of the lstm cell')
flags.DEFINE_integer('policy_fc_size', 2048,
'size of fully connected layers for policy part')
flags.DEFINE_float('weight_decay', 0.0002, 'weight decay')
flags.DEFINE_integer('goal_category_count', 5, 'number of goal categories')
flags.DEFINE_integer('action_size', 7, 'number of possible actions')
flags.DEFINE_integer('max_eval_episode_length', 100,
'maximum sequence length for evaluation.')
flags.DEFINE_enum('mode', 'train', ['train', 'eval'],
'indicates whether it is in training or evaluation')
flags.DEFINE_integer('test_iters', 194,
'number of iterations that the eval needs to be run')
flags.DEFINE_multi_string('gin_config', [],
'List of paths to a gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
'Newline separated list of Gin parameter bindings.')
flags.DEFINE_string(
'resnet50_path', './resnet_v2_50_checkpoint/resnet_v2_50.ckpt', 'path to resnet50 '
'checkpoint')
flags.DEFINE_bool('freeze_resnet_weights', True, '')
flags.DEFINE_string(
'eval_init_points_file_name', '',
'Name of the file that contains the initial locations and '
'worlds for each evaluation point')
FLAGS = flags.FLAGS
TRAIN_WORLDS = [
'Home_001_1', 'Home_001_2', 'Home_002_1', 'Home_003_1', 'Home_003_2',
'Home_004_1', 'Home_004_2', 'Home_005_1', 'Home_005_2', 'Home_006_1',
'Home_010_1'
]
TEST_WORLDS = ['Home_011_1', 'Home_013_1', 'Home_016_1']
def create_modality_types():
"""Parses the modality_types and returns a list of task_env.ModalityType."""
if not FLAGS.modality_types:
raise ValueError('there needs to be at least one modality type')
modality_types = FLAGS.modality_types.split('_')
for x in modality_types:
if x not in ['image', 'sseg', 'det', 'depth']:
raise ValueError('invalid modality type: {}'.format(x))
conversion_dict = {
'image': task_env.ModalityTypes.IMAGE,
'sseg': task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
'depth': task_env.ModalityTypes.DEPTH,
'det': task_env.ModalityTypes.OBJECT_DETECTION,
}
return [conversion_dict[k] for k in modality_types]
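# Example usage (a sketch; assumes the flags have already been parsed):
#   FLAGS.modality_types = 'det_depth'
#   create_modality_types()
#   # -> [task_env.ModalityTypes.OBJECT_DETECTION, task_env.ModalityTypes.DEPTH]
# Any token outside {'image', 'sseg', 'det', 'depth'} raises a ValueError.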
def create_task_io_config(
modality_types,
goal_category_count,
action_size,
sequence_length,
):
"""Generates task io config."""
shape_prefix = [sequence_length, FLAGS.obs_resolution, FLAGS.obs_resolution]
shapes = {
task_env.ModalityTypes.IMAGE: [sequence_length, 224, 224, 3],
task_env.ModalityTypes.DEPTH: shape_prefix + [
2,
],
task_env.ModalityTypes.SEMANTIC_SEGMENTATION: shape_prefix + [
1,
],
task_env.ModalityTypes.OBJECT_DETECTION: shape_prefix + [
90,
]
}
types = {k: tf.float32 for k in shapes}
types[task_env.ModalityTypes.IMAGE] = tf.uint8
inputs = collections.OrderedDict(
[[mtype, (types[mtype], shapes[mtype])] for mtype in modality_types])
inputs[task_env.ModalityTypes.GOAL] = (tf.float32,
[sequence_length, goal_category_count])
inputs[task_env.ModalityTypes.PREV_ACTION] = (tf.float32, [
sequence_length, action_size + 1
])
print inputs
return tasks.UnrolledTaskIOConfig(
inputs=inputs,
output=(tf.float32, [sequence_length, action_size]),
query=None)
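# Sketch of the resulting config for a 'det'-only run with the default flags
# (sequence_length=20, obs_resolution=64, goal_category_count=5, action_size=7;
# the shapes below are implied by the code above, not measured):
#   inputs[OBJECT_DETECTION] : (tf.float32, [20, 64, 64, 90])
#   inputs[GOAL]             : (tf.float32, [20, 5])
#   inputs[PREV_ACTION]      : (tf.float32, [20, 8])   # action_size + 1
#   output                   : (tf.float32, [20, 7])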
def map_to_embedder(modality_type):
"""Maps modality_type to its corresponding embedder."""
if modality_type == task_env.ModalityTypes.PREV_ACTION:
return None
if modality_type == task_env.ModalityTypes.GOAL:
return embedders.IdentityEmbedder()
if modality_type == task_env.ModalityTypes.IMAGE:
return embedders.ResNet50Embedder()
conv_window_sizes = [int(x) for x in FLAGS.conv_window_sizes.split('_')]
conv_channels = [int(x) for x in FLAGS.conv_channels.split('_')]
conv_strides = [int(x) for x in FLAGS.conv_strides.split('_')]
params = tf.contrib.training.HParams(
to_one_hot=modality_type == task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
one_hot_length=10,
conv_sizes=conv_window_sizes,
conv_strides=conv_strides,
conv_channels=conv_channels,
embedding_size=FLAGS.embedding_fc_size,
weight_decay_rate=FLAGS.weight_decay,
)
return embedders.SmallNetworkEmbedder(params)
def create_train_and_init_ops(policy, task):
"""Creates training ops given the arguments.
Args:
policy: the policy for the task.
task: the task instance.
Returns:
train_op: the op that needs to be run at each step.
summaries_op: the summary op that is executed.
init_fn: the op that initializes the variables if there is no previous
checkpoint. If Resnet50 is not used in the model it is None, otherwise
it reads the weights from FLAGS.resnet50_path and sets the init_fn
to the op that initializes the ResNet50 with the pre-trained weights.
"""
assert isinstance(task, tasks.GotoStaticXNoExplorationTask)
assert isinstance(policy, policies.Policy)
inputs, _, gt_outputs, masks = task.tf_episode_batch(FLAGS.batch_size)
outputs, _ = policy.build(inputs, None)
loss = task.target_loss(gt_outputs, outputs, masks)
init_fn = None
# If resnet is added to the graph, init_fn should initialize resnet weights
# if there is no previous checkpoint.
variables_assign_dict = {}
vars_list = []
for v in slim.get_model_variables():
if v.name.find('resnet') >= 0:
if not FLAGS.freeze_resnet_weights:
vars_list.append(v)
variables_assign_dict[v.name[v.name.find('resnet'):-2]] = v
else:
vars_list.append(v)
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
FLAGS.learning_rate,
global_step,
decay_steps=FLAGS.decay_steps,
decay_rate=0.98,
staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = slim.learning.create_train_op(
loss,
optimizer,
global_step=global_step,
variables_to_train=vars_list,
)
if variables_assign_dict:
init_fn = slim.assign_from_checkpoint_fn(
FLAGS.resnet50_path,
variables_assign_dict,
ignore_missing_vars=False)
scalar_summaries = {}
scalar_summaries['LR'] = learning_rate
scalar_summaries['loss'] = loss
for name, summary in scalar_summaries.iteritems():
tf.summary.scalar(name, summary)
return train_op, init_fn
def create_eval_ops(policy, config, possible_targets):
"""Creates the necessary ops for evaluation."""
inputs_feed = collections.OrderedDict([[
mtype,
tf.placeholder(config.inputs[mtype].type,
[1] + config.inputs[mtype].shape)
] for mtype in config.inputs])
inputs_feed[task_env.ModalityTypes.PREV_ACTION] = tf.placeholder(
tf.float32, [1, 1] + [
config.output.shape[-1] + 1,
])
prev_state_feed = [
tf.placeholder(
tf.float32, [1, FLAGS.lstm_cell_size], name='prev_state_{}'.format(i))
for i in range(2)
]
policy_outputs = policy.build(inputs_feed, prev_state_feed)
summary_feed = {}
for c in possible_targets + ['mean']:
summary_feed[c] = tf.placeholder(
tf.float32, [], name='eval_in_range_{}_input'.format(c))
tf.summary.scalar('eval_in_range_{}'.format(c), summary_feed[c])
return inputs_feed, prev_state_feed, policy_outputs, (tf.summary.merge_all(),
summary_feed)
def unroll_policy_for_eval(
sess,
env,
inputs_feed,
prev_state_feed,
policy_outputs,
number_of_steps,
output_folder,
):
"""unrolls the policy for testing.
Args:
sess: tf.Session
env: The environment.
inputs_feed: dictionary of placeholder for the input modalities.
prev_state_feed: placeholder for the input to the prev_state of the model.
policy_outputs: tensor that contains outputs of the policy.
number_of_steps: maximum number of unrolling steps.
output_folder: folder where the function writes a dictionary of detailed
information about the path. The dictionary keys are 'states' and
'distance'. The value for 'states' is the list of states that the agent
visits along the path. The value for 'distance' contains the length of the
shortest path to the goal at each step.
Returns:
states: list of states along the path.
distance: list of distances along the path.
"""
prev_state = [
np.zeros((1, FLAGS.lstm_cell_size), dtype=np.float32) for _ in range(2)
]
prev_action = np.zeros((1, 1, FLAGS.action_size + 1), dtype=np.float32)
obs = env.reset()
distances_to_goal = []
states = []
unique_id = '{}_{}'.format(env.cur_image_id(), env.goal_string)
for _ in range(number_of_steps):
distances_to_goal.append(
np.min([
len(
nx.shortest_path(env.graph, env.pose_to_vertex(env.state()),
env.pose_to_vertex(target_view)))
for target_view in env.targets()
]))
states.append(env.state())
feed_dict = {inputs_feed[mtype]: [[obs[mtype]]] for mtype in inputs_feed}
feed_dict[prev_state_feed[0]] = prev_state[0]
feed_dict[prev_state_feed[1]] = prev_state[1]
action_values, prev_state = sess.run(policy_outputs, feed_dict=feed_dict)
chosen_action = np.argmax(action_values[0])
obs, _, done, info = env.step(np.int32(chosen_action))
prev_action[0][0][chosen_action] = 1.
prev_action[0][0][-1] = float(info['success'])
# If the agent chooses the stop action or the number of steps exceeds
# env._episode_length.
if done:
break
output_path = os.path.join(output_folder, unique_id + '.npy')
with tf.gfile.Open(output_path, 'w') as f:
print 'saving path information to {}'.format(output_path)
np.save(f, {'states': states, 'distance': distances_to_goal})
return states, distances_to_goal
def init(sequence_length, eval_init_points_file_name, worlds):
"""Initializes the common operations between train and test."""
modality_types = create_modality_types()
logging.info('modality types: %r', modality_types)
# A negative reward_goal_range prevents the env from terminating early when
# the agent is close to the goal. The policy should keep the agent going until
# the end of the 100 steps, either by choosing the stop action or by
# oscillating around the target.
env = active_vision_dataset_env.ActiveVisionDatasetEnv(
modality_types=modality_types +
[task_env.ModalityTypes.GOAL, task_env.ModalityTypes.PREV_ACTION],
reward_goal_range=-1,
eval_init_points_file_name=eval_init_points_file_name,
worlds=worlds,
output_size=FLAGS.obs_resolution,
)
config = create_task_io_config(
modality_types=modality_types,
goal_category_count=FLAGS.goal_category_count,
action_size=FLAGS.action_size,
sequence_length=sequence_length,
)
task = tasks.GotoStaticXNoExplorationTask(env=env, config=config)
embedders_dict = {mtype: map_to_embedder(mtype) for mtype in config.inputs}
policy_params = tf.contrib.training.HParams(
lstm_state_size=FLAGS.lstm_cell_size,
fc_channels=FLAGS.policy_fc_size,
weight_decay=FLAGS.weight_decay,
target_embedding_size=FLAGS.embedding_fc_size,
)
policy = policies.LSTMPolicy(
modality_names=config.inputs.keys(),
embedders_dict=embedders_dict,
action_size=FLAGS.action_size,
params=policy_params,
max_episode_length=sequence_length)
return env, config, task, policy
def test():
"""Contains all the operations for testing policies."""
env, config, _, policy = init(1, 'all_init_configs', TEST_WORLDS)
inputs_feed, prev_state_feed, policy_outputs, summary_op = create_eval_ops(
policy, config, env.possible_targets)
sv = tf.train.Supervisor(logdir=FLAGS.logdir)
prev_checkpoint = None
with sv.managed_session(
start_standard_services=False,
config=tf.ConfigProto(allow_soft_placement=True)) as sess:
while not sv.should_stop():
while True:
new_checkpoint = tf.train.latest_checkpoint(FLAGS.logdir)
print 'new_checkpoint ', new_checkpoint
if not new_checkpoint:
time.sleep(1)
continue
if prev_checkpoint is None:
prev_checkpoint = new_checkpoint
break
if prev_checkpoint != new_checkpoint:
prev_checkpoint = new_checkpoint
break
else: # if prev_checkpoint == new_checkpoint, we have to wait more.
time.sleep(1)
checkpoint_step = int(new_checkpoint[new_checkpoint.rfind('-') + 1:])
sv.saver.restore(sess, new_checkpoint)
print '--------------------'
print 'evaluating checkpoint {}'.format(new_checkpoint)
folder_path = os.path.join(FLAGS.logdir, 'evals', str(checkpoint_step))
if not tf.gfile.Exists(folder_path):
tf.gfile.MakeDirs(folder_path)
eval_stats = {c: [] for c in env.possible_targets}
for test_iter in range(FLAGS.test_iters):
print 'evaluating {} of {}'.format(test_iter, FLAGS.test_iters)
_, distance_to_goal = unroll_policy_for_eval(
sess,
env,
inputs_feed,
prev_state_feed,
policy_outputs,
FLAGS.max_eval_episode_length,
folder_path,
)
print 'goal = {}'.format(env.goal_string)
eval_stats[env.goal_string].append(float(distance_to_goal[-1] <= 7))
eval_stats = {k: np.mean(v) for k, v in eval_stats.iteritems()}
eval_stats['mean'] = np.mean(eval_stats.values())
print eval_stats
feed_dict = {summary_op[1][c]: eval_stats[c] for c in eval_stats}
summary_str = sess.run(summary_op[0], feed_dict=feed_dict)
writer = sv.summary_writer
writer.add_summary(summary_str, checkpoint_step)
writer.flush()
def train():
_, _, task, policy = init(FLAGS.sequence_length, None, TRAIN_WORLDS)
print(FLAGS.save_summaries_secs)
print(FLAGS.save_interval_secs)
print(FLAGS.logdir)
with tf.device(
tf.train.replica_device_setter(ps_tasks=FLAGS.ps_tasks, merge_devices=True)):
train_op, init_fn = create_train_and_init_ops(policy=policy, task=task)
print(FLAGS.logdir)
slim.learning.train(
train_op=train_op,
init_fn=init_fn,
logdir=FLAGS.logdir,
is_chief=FLAGS.task_id == 0,
number_of_steps=FLAGS.train_iters,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs,
session_config=tf.ConfigProto(allow_soft_placement=True),
)
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
if FLAGS.mode == 'train':
train()
else:
test()
if __name__ == '__main__':
app.run(main)
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# blaze build -c opt train_supervised_active_vision
# bazel build -c opt --config=cuda --copt=-mavx train_supervised_active_vision && \
bazel-bin/research/cognitive_planning/train_supervised_active_vision \
--mode='train' \
--logdir=/usr/local/google/home/kosecka/local_avd_train/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="/cns/jn-d/home/kosecka/AVD_Minimal/"' \
--logtostderr
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A set of functions that are used for visualization.
These functions often receive an image and perform some visualization on it.
The functions do not return a value; instead they modify the image itself.
"""
import collections
import functools
# Set headless-friendly backend.
import matplotlib; matplotlib.use('Agg') # pylint: disable=multiple-statements
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
import six
import tensorflow as tf
import standard_fields as fields
_TITLE_LEFT_MARGIN = 10
_TITLE_TOP_MARGIN = 10
STANDARD_COLORS = [
'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
'WhiteSmoke', 'Yellow', 'YellowGreen'
]
def save_image_array_as_png(image, output_path):
"""Saves an image (represented as a numpy array) to PNG.
Args:
image: a numpy array with shape [height, width, 3].
output_path: path to which image should be written.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
with tf.gfile.Open(output_path, 'w') as fid:
image_pil.save(fid, 'PNG')
def encode_image_array_as_png_str(image):
"""Encodes a numpy array into a PNG string.
Args:
image: a numpy array with shape [height, width, 3].
Returns:
PNG encoded image string.
"""
image_pil = Image.fromarray(np.uint8(image))
output = six.BytesIO()
image_pil.save(output, format='PNG')
png_string = output.getvalue()
output.close()
return png_string
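# Example usage (a minimal sketch with a synthetic image; the array contents
# are arbitrary):
#   img = np.zeros((8, 8, 3), dtype=np.uint8)
#   png_bytes = encode_image_array_as_png_str(img)
#   # png_bytes starts with the PNG signature b'\x89PNG'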
def draw_bounding_box_on_image_array(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image (numpy array).
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Args:
image: a numpy array with shape [height, width, 3].
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_bounding_box_on_image(image_pil, ymin, xmin, ymax, xmax, color,
thickness, display_str_list,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_bounding_box_on_image(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image.
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Each string in display_str_list is displayed on a separate line above the
bounding box in black text on a rectangle filled with the input 'color'.
If the top of the bounding box extends to the edge of the image, the strings
are displayed below the bounding box.
Args:
image: a PIL.Image object.
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
if use_normalized_coordinates:
(left, right, top, bottom) = (xmin * im_width, xmax * im_width,
ymin * im_height, ymax * im_height)
else:
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line([(left, top), (left, bottom), (right, bottom),
(right, top), (left, top)], width=thickness, fill=color)
try:
font = ImageFont.truetype('arial.ttf', 24)
except IOError:
font = ImageFont.load_default()
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
# Each display_str has a top and bottom margin of 0.05x.
total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
if top > total_display_str_height:
text_bottom = top
else:
text_bottom = bottom + total_display_str_height
# Reverse list and print from bottom to top.
for display_str in display_str_list[::-1]:
text_width, text_height = font.getsize(display_str)
margin = np.ceil(0.05 * text_height)
draw.rectangle(
[(left, text_bottom - text_height - 2 * margin), (left + text_width,
text_bottom)],
fill=color)
draw.text(
(left + margin, text_bottom - text_height - margin),
display_str,
fill='black',
font=font)
text_bottom -= text_height - 2 * margin
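# Example usage (a hedged sketch on a blank PIL image; the coordinates and
# label are made up):
#   img = Image.new('RGB', (200, 100))
#   draw_bounding_box_on_image(img, ymin=0.1, xmin=0.1, ymax=0.9, xmax=0.5,
#                              color='LimeGreen', thickness=2,
#                              display_str_list=['person: 87%'])
# With use_normalized_coordinates=True (the default) the box spans 10%-90% of
# the image height and 10%-50% of the image width.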
def draw_bounding_boxes_on_image_array(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image (numpy array).
Args:
image: a numpy array object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
image_pil = Image.fromarray(image)
draw_bounding_boxes_on_image(image_pil, boxes, color, thickness,
display_str_list_list)
np.copyto(image, np.array(image_pil))
def draw_bounding_boxes_on_image(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image.
Args:
image: a PIL.Image object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
boxes_shape = boxes.shape
if not boxes_shape:
return
if len(boxes_shape) != 2 or boxes_shape[1] != 4:
raise ValueError('Input must be of size [N, 4]')
for i in range(boxes_shape[0]):
display_str_list = ()
if display_str_list_list:
display_str_list = display_str_list_list[i]
draw_bounding_box_on_image(image, boxes[i, 0], boxes[i, 1], boxes[i, 2],
boxes[i, 3], color, thickness, display_str_list)
def _visualize_boxes(image, boxes, classes, scores, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image, boxes, classes, scores, category_index=category_index, **kwargs)
def _visualize_boxes_and_masks(image, boxes, classes, scores, masks,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
**kwargs)
def _visualize_boxes_and_keypoints(image, boxes, classes, scores, keypoints,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
keypoints=keypoints,
**kwargs)
def _visualize_boxes_and_masks_and_keypoints(
image, boxes, classes, scores, masks, keypoints, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
keypoints=keypoints,
**kwargs)
def draw_bounding_boxes_on_image_tensors(images,
boxes,
classes,
scores,
category_index,
instance_masks=None,
keypoints=None,
max_boxes_to_draw=20,
min_score_thresh=0.2,
use_normalized_coordinates=True):
"""Draws bounding boxes, masks, and keypoints on batch of image tensors.
Args:
images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
channels will be ignored.
boxes: [N, max_detections, 4] float32 tensor of detection boxes.
classes: [N, max_detections] int tensor of detection classes. Note that
classes are 1-indexed.
scores: [N, max_detections] float32 tensor of detection scores.
category_index: a dict that maps integer ids to category dicts. e.g.
{1: {1: 'dog'}, 2: {2: 'cat'}, ...}
instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with
instance masks.
keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2]
with keypoints.
max_boxes_to_draw: Maximum number of boxes to draw on an image. Default 20.
min_score_thresh: Minimum score threshold for visualization. Default 0.2.
use_normalized_coordinates: Whether to assume boxes and keypoints are in
normalized coordinates (as opposed to absolute coordinates).
Default is True.
Returns:
4D image tensor of type uint8, with boxes drawn on top.
"""
# Additional channels are being ignored.
images = images[:, :, :, 0:3]
visualization_keyword_args = {
'use_normalized_coordinates': use_normalized_coordinates,
'max_boxes_to_draw': max_boxes_to_draw,
'min_score_thresh': min_score_thresh,
'agnostic_mode': False,
'line_thickness': 4
}
if instance_masks is not None and keypoints is None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks]
elif instance_masks is None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, keypoints]
elif instance_masks is not None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks, keypoints]
else:
visualize_boxes_fn = functools.partial(
_visualize_boxes,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores]
def draw_boxes(image_and_detections):
"""Draws boxes on image."""
image_with_boxes = tf.py_func(visualize_boxes_fn, image_and_detections,
tf.uint8)
return image_with_boxes
images = tf.map_fn(draw_boxes, elems, dtype=tf.uint8, back_prop=False)
return images
def draw_side_by_side_evaluation_image(eval_dict,
category_index,
max_boxes_to_draw=20,
min_score_thresh=0.2,
use_normalized_coordinates=True):
"""Creates a side-by-side image with detections and groundtruth.
Bounding boxes (and instance masks, if available) are visualized on both
subimages.
Args:
eval_dict: The evaluation dictionary returned by
eval_util.result_dict_for_single_example().
category_index: A category index (dictionary) produced from a labelmap.
max_boxes_to_draw: The maximum number of boxes to draw for detections.
min_score_thresh: The minimum score threshold for showing detections.
use_normalized_coordinates: Whether to assume boxes and keypoints are in
normalized coordinates (as opposed to absolute coordinates).
Default is True.
Returns:
A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to
detections, while the subimage on the right corresponds to groundtruth.
"""
detection_fields = fields.DetectionResultFields()
input_data_fields = fields.InputDataFields()
instance_masks = None
if detection_fields.detection_masks in eval_dict:
instance_masks = tf.cast(
tf.expand_dims(eval_dict[detection_fields.detection_masks], axis=0),
tf.uint8)
keypoints = None
if detection_fields.detection_keypoints in eval_dict:
keypoints = tf.expand_dims(
eval_dict[detection_fields.detection_keypoints], axis=0)
groundtruth_instance_masks = None
if input_data_fields.groundtruth_instance_masks in eval_dict:
groundtruth_instance_masks = tf.cast(
tf.expand_dims(
eval_dict[input_data_fields.groundtruth_instance_masks], axis=0),
tf.uint8)
images_with_detections = draw_bounding_boxes_on_image_tensors(
eval_dict[input_data_fields.original_image],
tf.expand_dims(eval_dict[detection_fields.detection_boxes], axis=0),
tf.expand_dims(eval_dict[detection_fields.detection_classes], axis=0),
tf.expand_dims(eval_dict[detection_fields.detection_scores], axis=0),
category_index,
instance_masks=instance_masks,
keypoints=keypoints,
max_boxes_to_draw=max_boxes_to_draw,
min_score_thresh=min_score_thresh,
use_normalized_coordinates=use_normalized_coordinates)
images_with_groundtruth = draw_bounding_boxes_on_image_tensors(
eval_dict[input_data_fields.original_image],
tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0),
tf.expand_dims(eval_dict[input_data_fields.groundtruth_classes], axis=0),
tf.expand_dims(
tf.ones_like(
eval_dict[input_data_fields.groundtruth_classes],
dtype=tf.float32),
axis=0),
category_index,
instance_masks=groundtruth_instance_masks,
keypoints=None,
max_boxes_to_draw=None,
min_score_thresh=0.0,
use_normalized_coordinates=use_normalized_coordinates)
return tf.concat([images_with_detections, images_with_groundtruth], axis=2)
def draw_keypoints_on_image_array(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image (numpy array).
Args:
image: a numpy array with shape [height, width, 3].
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_keypoints_on_image(image_pil, keypoints, color, radius,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_keypoints_on_image(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image.
Args:
image: a PIL.Image object.
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
keypoints_x = [k[1] for k in keypoints]
keypoints_y = [k[0] for k in keypoints]
if use_normalized_coordinates:
keypoints_x = tuple([im_width * x for x in keypoints_x])
keypoints_y = tuple([im_height * y for y in keypoints_y])
for keypoint_x, keypoint_y in zip(keypoints_x, keypoints_y):
draw.ellipse([(keypoint_x - radius, keypoint_y - radius),
(keypoint_x + radius, keypoint_y + radius)],
outline=color, fill=color)
def draw_mask_on_image_array(image, mask, color='red', alpha=0.4):
"""Draws mask on an image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
mask: a uint8 numpy array of shape (img_height, img_width) with
values of either 0 or 1.
color: color to draw the mask with. Default is red.
alpha: transparency value between 0 and 1. (default: 0.4)
Raises:
ValueError: On incorrect data type for image or masks.
"""
if image.dtype != np.uint8:
raise ValueError('`image` not of type np.uint8')
if mask.dtype != np.uint8:
raise ValueError('`mask` not of type np.uint8')
if np.any(np.logical_and(mask != 1, mask != 0)):
raise ValueError('`mask` elements should be in [0, 1]')
if image.shape[:2] != mask.shape:
raise ValueError('The image has spatial dimensions %s but the mask has '
'dimensions %s' % (image.shape[:2], mask.shape))
rgb = ImageColor.getrgb(color)
pil_image = Image.fromarray(image)
solid_color = np.expand_dims(
np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
pil_solid_color = Image.fromarray(np.uint8(solid_color)).convert('RGBA')
pil_mask = Image.fromarray(np.uint8(255.0*alpha*mask)).convert('L')
pil_image = Image.composite(pil_solid_color, pil_image, pil_mask)
np.copyto(image, np.array(pil_image.convert('RGB')))
def visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index,
instance_masks=None,
instance_boundaries=None,
keypoints=None,
use_normalized_coordinates=False,
max_boxes_to_draw=20,
min_score_thresh=.5,
agnostic_mode=False,
line_thickness=4,
groundtruth_box_visualization_color='black',
skip_scores=False,
skip_labels=False):
"""Overlay labeled boxes on an image with formatted scores and label names.
This function groups boxes that correspond to the same location
and creates a display string for each detection and overlays these
on the image. Note that this function modifies the image in place, and returns
that same image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
boxes: a numpy array of shape [N, 4]
classes: a numpy array of shape [N]. Note that class indices are 1-based,
and match the keys in the label map.
scores: a numpy array of shape [N] or None. If scores=None, then
this function assumes that the boxes to be plotted are groundtruth
boxes and plot all boxes as black with no classes or scores.
category_index: a dict containing category dictionaries (each holding
category index `id` and category name `name`) keyed by category indices.
instance_masks: a numpy array of shape [N, image_height, image_width] with
values ranging between 0 and 1, can be None.
instance_boundaries: a numpy array of shape [N, image_height, image_width]
with values ranging between 0 and 1, can be None.
keypoints: a numpy array of shape [N, num_keypoints, 2], can
be None
use_normalized_coordinates: whether boxes is to be interpreted as
normalized coordinates or not.
max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
all boxes.
min_score_thresh: minimum score threshold for a box to be visualized
agnostic_mode: boolean (default: False) controlling whether to evaluate in
class-agnostic mode or not. This mode will display scores but ignore
classes.
line_thickness: integer (default: 4) controlling line width of the boxes.
groundtruth_box_visualization_color: box color for visualizing groundtruth
boxes
skip_scores: whether to skip score when drawing a single detection
skip_labels: whether to skip label when drawing a single detection
Returns:
uint8 numpy array with shape (img_height, img_width, 3) with overlaid boxes.
"""
# Create a display string (and color) for every box location, group any boxes
# that correspond to the same location.
box_to_display_str_map = collections.defaultdict(list)
box_to_color_map = collections.defaultdict(str)
box_to_instance_masks_map = {}
box_to_instance_boundaries_map = {}
box_to_keypoints_map = collections.defaultdict(list)
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
for i in range(min(max_boxes_to_draw, boxes.shape[0])):
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
if instance_masks is not None:
box_to_instance_masks_map[box] = instance_masks[i]
if instance_boundaries is not None:
box_to_instance_boundaries_map[box] = instance_boundaries[i]
if keypoints is not None:
box_to_keypoints_map[box].extend(keypoints[i])
if scores is None:
box_to_color_map[box] = groundtruth_box_visualization_color
else:
display_str = ''
if not skip_labels:
if not agnostic_mode:
if classes[i] in category_index.keys():
class_name = category_index[classes[i]]['name']
else:
class_name = 'N/A'
display_str = str(class_name)
if not skip_scores:
if not display_str:
display_str = '{}%'.format(int(100*scores[i]))
else:
display_str = '{}: {}%'.format(display_str, int(100*scores[i]))
box_to_display_str_map[box].append(display_str)
if agnostic_mode:
box_to_color_map[box] = 'DarkOrange'
else:
box_to_color_map[box] = STANDARD_COLORS[
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
image,
box_to_instance_masks_map[box],
color=color
)
if instance_boundaries is not None:
draw_mask_on_image_array(
image,
box_to_instance_boundaries_map[box],
color='red',
alpha=1.0
)
draw_bounding_box_on_image_array(
image,
ymin,
xmin,
ymax,
xmax,
color=color,
thickness=line_thickness,
display_str_list=box_to_display_str_map[box],
use_normalized_coordinates=use_normalized_coordinates)
if keypoints is not None:
draw_keypoints_on_image_array(
image,
box_to_keypoints_map[box],
color=color,
radius=line_thickness / 2,
use_normalized_coordinates=use_normalized_coordinates)
return image
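# Example usage (a sketch with synthetic detections; the category_index and
# scores below are hypothetical):
#   image = np.zeros((100, 200, 3), dtype=np.uint8)
#   boxes = np.array([[0.1, 0.1, 0.9, 0.5]])
#   classes = np.array([1])
#   scores = np.array([0.9])
#   category_index = {1: {'id': 1, 'name': 'person'}}
#   visualize_boxes_and_labels_on_image_array(
#       image, boxes, classes, scores, category_index,
#       use_normalized_coordinates=True)
# The call draws one labeled box on `image` in place and returns the same array.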
def add_cdf_image_summary(values, name):
"""Adds a tf.summary.image for a CDF plot of the values.
Normalizes `values` such that they sum to 1, plots the cumulative distribution
function and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
name: name for the image summary.
"""
def cdf_plot(values):
"""Numpy function to plot CDF."""
normalized_values = values / np.sum(values)
sorted_values = np.sort(normalized_values)
cumulative_values = np.cumsum(sorted_values)
fraction_of_examples = (np.arange(cumulative_values.size, dtype=np.float32)
/ cumulative_values.size)
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
ax.plot(fraction_of_examples, cumulative_values)
ax.set_ylabel('cumulative normalized values')
ax.set_xlabel('fraction of examples')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
cdf_plot = tf.py_func(cdf_plot, [values], tf.uint8)
tf.summary.image(name, cdf_plot)
def add_hist_image_summary(values, bins, name):
"""Adds a tf.summary.image for a histogram plot of the values.
Plots the histogram of values and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
bins: bin edges which will be directly passed to np.histogram.
name: name for the image summary.
"""
def hist_plot(values, bins):
"""Numpy function to plot hist."""
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
y, x = np.histogram(values, bins=bins)
ax.plot(x[:-1], y)
ax.set_ylabel('count')
ax.set_xlabel('value')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(
fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
hist_plot = tf.py_func(hist_plot, [values, bins], tf.uint8)
tf.summary.image(name, hist_plot)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Initializes at random location and visualizes the optimal path.
Different modes of execution:
1) benchmark: It generates benchmark_iter sample trajectories to random goals
and plots the histogram of path lengths. It can also be used to see how fast
the environment runs.
2) vis: It visualizes the generated paths by image, semantic segmentation, and
so on.
3) human: allows the user to navigate through the environment using keyboard input.
python viz_active_vision_dataset_main -- \
--mode=benchmark --benchmark_iter=1000 --gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=vis \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=human \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main.py --mode=eval --eval_folder=/usr/local/google/home/$USER/checkin_log_det/evals/ --output_folder=/usr/local/google/home/$USER/test_imgs/ --gin_config=envs/configs/active_vision_config.gin
"""
import matplotlib
# pylint: disable=g-import-not-at-top
# Need Tk for interactive plots.
matplotlib.use('TkAgg')
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np
import os
from pyglib import app
from pyglib import flags
import gin
import cv2
from envs import active_vision_dataset_env
from envs import task_env
VIS_MODE = 'vis'
HUMAN_MODE = 'human'
BENCHMARK_MODE = 'benchmark'
GRAPH_MODE = 'graph'
EVAL_MODE = 'eval'
flags.DEFINE_enum('mode', VIS_MODE,
[VIS_MODE, HUMAN_MODE, BENCHMARK_MODE, GRAPH_MODE, EVAL_MODE],
'mode of the execution')
flags.DEFINE_integer('benchmark_iter', 1000,
'number of iterations for benchmarking')
flags.DEFINE_string('eval_folder', '', 'the path to the eval folder')
flags.DEFINE_string('output_folder', '',
'the path to which the images and gifs are written')
flags.DEFINE_multi_string('gin_config', [],
'List of paths to gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
'Newline separated list of Gin parameter bindings.')
mt = task_env.ModalityTypes
FLAGS = flags.FLAGS
def benchmark(env, targets):
"""Benchmarks the speed of sequence generation by env.
Args:
env: environment.
targets: list of target classes.
"""
episode_lengths = {}
all_init_configs = {}
all_actions = dict([(a, 0.) for a in env.actions])
for i in range(FLAGS.benchmark_iter):
path, actions, _, _ = env.random_step_sequence()
selected_actions = np.argmax(actions, axis=-1)
new_actions = dict([(a, 0.) for a in env.actions])
for a in selected_actions:
new_actions[env.actions[a]] += 1. / selected_actions.shape[0]
for a in new_actions:
all_actions[a] += new_actions[a] / FLAGS.benchmark_iter
start_image_id, world, goal = env.get_init_config(path)
print world
if world not in all_init_configs:
all_init_configs[world] = set()
all_init_configs[world].add((start_image_id, goal, len(actions)))
if env.goal_index not in episode_lengths:
episode_lengths[env.goal_index] = []
episode_lengths[env.goal_index].append(len(actions))
for i, cls in enumerate(episode_lengths):
plt.subplot(231 + i)
plt.hist(episode_lengths[cls])
plt.title(targets[cls])
plt.show()
def human(env, targets):
"""Lets user play around the env manually."""
string_key_map = {
'a': 'left',
'd': 'right',
'w': 'forward',
's': 'backward',
'j': 'rotate_ccw',
'l': 'rotate_cw',
'n': 'stop'
}
integer_key_map = {
'a': env.actions.index('left'),
'd': env.actions.index('right'),
'w': env.actions.index('forward'),
's': env.actions.index('backward'),
'j': env.actions.index('rotate_ccw'),
'l': env.actions.index('rotate_cw'),
'n': env.actions.index('stop')
}
for k in integer_key_map:
integer_key_map[k] = np.int32(integer_key_map[k])
plt.ion()
for _ in range(20):
obs = env.reset()
steps = -1
action = None
while True:
print 'distance = ', obs[task_env.ModalityTypes.DISTANCE]
steps += 1
depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
det_mask = np.argmax(
obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
img = obs[task_env.ModalityTypes.IMAGE]
plt.subplot(231)
plt.title('steps = {}'.format(steps))
plt.imshow(img.astype(np.uint8))
plt.subplot(232)
plt.imshow(depth_value)
plt.title('depth value')
plt.subplot(233)
plt.imshow(depth_mask)
plt.title('depth mask')
plt.subplot(234)
plt.imshow(seg_mask)
plt.title('seg')
plt.subplot(235)
plt.imshow(det_mask)
plt.title('det')
plt.subplot(236)
plt.title('goal={}'.format(targets[env.goal_index]))
plt.draw()
while True:
s = raw_input('key = ')
if np.random.rand() > 0.5:
key_map = string_key_map
else:
key_map = integer_key_map
if s in key_map:
action = key_map[s]
break
else:
print 'invalid action'
print 'action = {}'.format(action)
if action == 'stop':
print 'dist to goal: {}'.format(len(env.path_to_goal()) - 2)
break
obs, reward, done, info = env.step(action)
print 'reward = {}, done = {}, success = {}'.format(
reward, done, info['success'])
def visualize_random_step_sequence(env):
"""Visualizes random sequence of steps."""
plt.ion()
for _ in range(20):
path, actions, _, step_outputs = env.random_step_sequence(max_len=30)
print 'path = {}'.format(path)
for action, step_output in zip(actions, step_outputs):
obs, _, done, _ = step_output
depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
det_mask = np.argmax(
obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
img = obs[task_env.ModalityTypes.IMAGE]
plt.subplot(231)
plt.imshow(img.astype(np.uint8))
plt.subplot(232)
plt.imshow(depth_value)
plt.title('depth value')
plt.subplot(233)
plt.imshow(depth_mask)
plt.title('depth mask')
plt.subplot(234)
plt.imshow(seg_mask)
plt.title('seg')
plt.subplot(235)
plt.imshow(det_mask)
plt.title('det')
plt.subplot(236)
print 'action = {}'.format(action)
print 'done = {}'.format(done)
plt.draw()
if raw_input('press \'n\' to go to the next random sequence. Otherwise, '
'press any key to continue...') == 'n':
break
def visualize(env, input_folder, output_root_folder):
"""visualizes images for sequence of steps from the evals folder."""
def which_env(file_name):
img_name = file_name.split('_')[0][2:5]
env_dict = {'161': 'Home_016_1', '131': 'Home_013_1', '111': 'Home_011_1'}
if img_name in env_dict:
return env_dict[img_name]
else:
raise ValueError('could not resolve env: {} {}'.format(
img_name, file_name))
def which_goal(file_name):
return file_name[file_name.find('_')+1:]
output_images_folder = os.path.join(output_root_folder, 'images')
output_gifs_folder = os.path.join(output_root_folder, 'gifs')
if not tf.gfile.IsDirectory(output_images_folder):
tf.gfile.MakeDirs(output_images_folder)
if not tf.gfile.IsDirectory(output_gifs_folder):
tf.gfile.MakeDirs(output_gifs_folder)
npy_files = [
os.path.join(input_folder, name)
for name in tf.gfile.ListDirectory(input_folder)
if name.find('npy') >= 0
]
for i, npy_file in enumerate(npy_files):
print 'saving images {}/{}'.format(i, len(npy_files))
pure_name = npy_file[npy_file.rfind('/') + 1:-4]
output_folder = os.path.join(output_images_folder, pure_name)
if not tf.gfile.IsDirectory(output_folder):
tf.gfile.MakeDirs(output_folder)
print '*******'
print pure_name[0:pure_name.find('_')]
env.reset_for_eval(which_env(pure_name),
which_goal(pure_name),
pure_name[0:pure_name.find('_')],
)
with tf.gfile.Open(npy_file) as h:
states = np.load(h).item()['states']
images = [
env.observation(state)[mt.IMAGE] for state in states
]
for j, img in enumerate(images):
cv2.imwrite(os.path.join(output_folder, '{0:03d}'.format(j) + '.jpg'),
img[:, :, ::-1])
print 'converting to gif'
os.system(
'convert -set delay 20 -colors 256 -dispose 1 {}/*.jpg {}.gif'.format(
output_folder,
os.path.join(output_gifs_folder, pure_name + '.gif')
)
)
def evaluate_folder(env, folder_path):
"""Evaluates the performance from the evals folder."""
targets = ['fridge', 'dining_table', 'microwave', 'tv', 'couch']
def compute_acc(npy_file):
with tf.gfile.Open(npy_file) as h:
data = np.load(h).item()
if npy_file.find('dining_table') >= 0:
category = 'dining_table'
else:
category = npy_file[npy_file.rfind('_') + 1:-4]
return category, data['distance'][-1] - 2
def evaluate_iteration(folder):
"""Evaluates the data from the folder of certain eval iteration."""
print folder
npy_files = [
os.path.join(folder, name)
for name in tf.gfile.ListDirectory(folder)
if name.find('npy') >= 0
]
eval_stats = {c: [] for c in targets}
for npy_file in npy_files:
try:
category, dist = compute_acc(npy_file)
except: # pylint: disable=bare-except
continue
eval_stats[category].append(float(dist <= 5))
for c in eval_stats:
if not eval_stats[c]:
print 'incomplete eval {}: empty class {}'.format(folder_path, c)
return None
eval_stats[c] = np.mean(eval_stats[c])
eval_stats['mean'] = np.mean(eval_stats.values())
return eval_stats
checkpoint_folders = [
folder_path + x
for x in tf.gfile.ListDirectory(folder_path)
if tf.gfile.IsDirectory(folder_path + x)
]
print '{} folders found'.format(len(checkpoint_folders))
print '------------------------'
all_iters = []
all_accs = []
for i, folder in enumerate(checkpoint_folders):
print 'processing {}/{}'.format(i, len(checkpoint_folders))
eval_stats = evaluate_iteration(folder)
if eval_stats is None:
continue
else:
iter_no = int(folder[folder.rfind('/') + 1:])
print 'result ', iter_no, eval_stats['mean']
all_accs.append(eval_stats['mean'])
all_iters.append(iter_no)
all_accs = np.asarray(all_accs)
all_iters = np.asarray(all_iters)
idx = np.argmax(all_accs)
print 'best result at iteration {} was {}'.format(all_iters[idx],
all_accs[idx])
order = np.argsort(all_iters)
all_iters = all_iters[order]
all_accs = all_accs[order]
#plt.plot(all_iters, all_accs)
#plt.show()
#print 'done plotting'
best_iteration_folder = os.path.join(folder_path, str(all_iters[idx]))
print 'generating gifs and images for {}'.format(best_iteration_folder)
visualize(env, best_iteration_folder, FLAGS.output_folder)
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
print('********')
print(FLAGS.mode)
print(FLAGS.gin_config)
print(FLAGS.gin_params)
env = active_vision_dataset_env.ActiveVisionDatasetEnv(modality_types=[
task_env.ModalityTypes.IMAGE,
task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
task_env.ModalityTypes.OBJECT_DETECTION, task_env.ModalityTypes.DEPTH,
task_env.ModalityTypes.DISTANCE
])
if FLAGS.mode == BENCHMARK_MODE:
benchmark(env, env.possible_targets)
elif FLAGS.mode == GRAPH_MODE:
for loc in env.worlds:
env.check_scene_graph(loc, 'fridge')
elif FLAGS.mode == HUMAN_MODE:
human(env, env.possible_targets)
elif FLAGS.mode == VIS_MODE:
visualize_random_step_sequence(env)
elif FLAGS.mode == EVAL_MODE:
evaluate_folder(env, FLAGS.eval_folder)
if __name__ == '__main__':
app.run(main)
# Cross-View Training
This repository contains code for *Semi-Supervised Sequence Modeling with Cross-View Training*. Currently sequence tagging and dependency parsing tasks are supported.
## Requirements
* [Tensorflow](https://www.tensorflow.org/)
* [Numpy](http://www.numpy.org/)
This code has been run with TensorFlow 1.10.1 and Numpy 1.14.5; other versions may work, but have not been tested.
## Fetching and Preprocessing Data
Run `fetch_data.sh` to download and extract pretrained [GloVe](https://nlp.stanford.edu/projects/glove/) vectors, the [1 Billion Word Language Model Benchmark](http://www.statmt.org/lm-benchmark/) corpus of unlabeled data, and the CoNLL-2000 [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/) dataset. Unfortunately the other datasets from our paper are not freely available and so can't be included in this repository.
To apply CVT to other datasets, the data should be placed in `data/raw_data/<task_name>/(train|dev|test).txt`. For sequence tagging data, each line should contain a word followed by a space followed by that word's tag. Sentences should be separated by empty lines. For dependency parsing, each tag should be of the form ``<index_of_head>-<relation>`` (e.g., `0-root`).
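For illustration only, here is a hypothetical sequence tagging file in this format (the path `data/raw_data/chunking/train.txt` and the chunk tags below are assumptions, not data shipped with the repository); each line is a word, a single space, and the word's tag, with sentences separated by blank lines:
```
The B-NP
cat I-NP
sat B-VP
. O

Dogs B-NP
bark B-VP
. O
```
For dependency parsing, the tag column would instead contain head-relation entries such as `3-nsubj`, following the ``<index_of_head>-<relation>`` form described above.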
After all of the data has been downloaded, run `preprocessing.py`.
## Training a Model
Run `python cvt.py --mode=train --model_name=chunking_model`. By default this trains a model on the chunking data downloaded with `fetch_data.sh`. To change which task(s) are trained on or model hyperparameters, modify [base/configure.py](base/configure.py). Models are automatically checkpointed every 1000 steps; training will continue from the latest checkpoint if training is interrupted and restarted. Model checkpoints and other data such as dev set accuracy over time are stored in `data/models/<model_name>`.
## Evaluating a Model
Run `python cvt.py --mode=eval --model_name=chunking_model`. A CVT model trained on the chunking data for 200k steps should get at least 97.1 F1 on the dev set and 96.6 F1 on the test set.
## Citation
If you use this code for your publication, please cite the original paper:
```
@inproceedings{clark2018semi,
title = {Semi-Supervised Sequence Modeling with Cross-View Training},
author = {Kevin Clark and Minh-Thang Luong and Christopher D. Manning and Quoc V. Le},
booktitle = {ACL},
year = {2018}
}
```
## Contact
* [Kevin Clark](https://cs.stanford.edu/~kevclark/) (@clarkkev).
* [Thang Luong](https://nlp.stanford.edu/~lmthang/) (@lmthang).