"git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "f44db16c8e0fbee1b964e802f1ab493afb6f7996"
Commit f5fc733a authored by Byzantine's avatar Byzantine
Browse files

Removing research/community models

parent 09bc9f54
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities for preprocessing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
def preprocess_image(image, output_height, output_width, is_training):
  """Preprocesses the given image.

  Crops or zero-pads the image to the output size, then rescales pixel
  values from [0, 255] to approximately [-1, 1].

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise. (Currently unused: the same pipeline is applied in
      both modes.)

  Returns:
    A preprocessed image.
  """
  image = tf.to_float(image)
  # BUG FIX: resize_image_with_crop_or_pad's signature is
  # (image, target_height, target_width); the original call passed the width
  # first, which produced transposed crops for non-square output sizes.
  image = tf.image.resize_image_with_crop_or_pad(
      image, output_height, output_width)
  # Center and scale: (x - 128) / 128 maps [0, 255] to roughly [-1, 1].
  image = tf.subtract(image, 128.0)
  image = tf.div(image, 128.0)
  return image
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains a factory for building various models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from preprocessing import cifarnet_preprocessing
from preprocessing import inception_preprocessing
from preprocessing import lenet_preprocessing
from preprocessing import vgg_preprocessing
slim = tf.contrib.slim
def get_preprocessing(name, is_training=False):
  """Returns preprocessing_fn(image, height, width, **kwargs).

  Args:
    name: The name of the preprocessing function.
    is_training: `True` if the model is being used for training and `False`
      otherwise.

  Returns:
    preprocessing_fn: A function that preprocessing a single image (pre-batch).
      It has the following signature:
        image = preprocessing_fn(image, output_height, output_width, ...).

  Raises:
    ValueError: If Preprocessing `name` is not recognized.
  """
  # Model families that share a preprocessing module.
  inception_family = (
      'inception', 'inception_v1', 'inception_v2', 'inception_v3',
      'inception_v4', 'inception_resnet_v2', 'mobilenet_v1',
      'nasnet_mobile', 'nasnet_large', 'pnasnet_large')
  vgg_family = (
      'resnet_v1_50', 'resnet_v1_101', 'resnet_v1_152', 'resnet_v1_200',
      'resnet_v2_50', 'resnet_v2_101', 'resnet_v2_152', 'resnet_v2_200',
      'vgg', 'vgg_a', 'vgg_16', 'vgg_19')

  preprocessing_fn_map = {
      'cifarnet': cifarnet_preprocessing,
      'lenet': lenet_preprocessing,
  }
  preprocessing_fn_map.update({m: inception_preprocessing
                               for m in inception_family})
  preprocessing_fn_map.update({m: vgg_preprocessing for m in vgg_family})

  module = preprocessing_fn_map.get(name)
  if module is None:
    raise ValueError('Preprocessing name [%s] was not recognized' % name)

  def preprocessing_fn(image, output_height, output_width, **kwargs):
    # Bind the chosen module and training flag into the returned closure.
    return module.preprocess_image(
        image, output_height, output_width, is_training=is_training, **kwargs)

  return preprocessing_fn
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
The preprocessing steps for VGG were introduced in the following technical
report:
Very Deep Convolutional Networks For Large-Scale Image Recognition
Karen Simonyan and Andrew Zisserman
arXiv technical report, 2015
PDF: http://arxiv.org/pdf/1409.1556.pdf
ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
CC-BY-4.0
More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim

# Per-channel means (RGB order) subtracted from images during preprocessing;
# these are the ImageNet training-set means used by the original VGG models.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94

# Bounds on the length of the smallest image side after the aspect-preserving
# resize; training samples a side length uniformly from this range.
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512
def _crop(image, offset_height, offset_width, crop_height, crop_width):
  """Extracts a `crop_height` x `crop_width` window from `image`.

  The input image's size may be unknown at graph-construction time, but its
  rank must be 3.

  Args:
    image: an image of shape [height, width, channels].
    offset_height: a scalar tensor indicating the height offset.
    offset_width: a scalar tensor indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    The cropped image.

  Raises:
    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
      less than the crop size.
  """
  input_shape = tf.shape(image)
  assert_rank = tf.Assert(
      tf.equal(tf.rank(image), 3),
      ['Rank of image must be equal to 3.'])
  # Only read the channel count after the rank check has run.
  with tf.control_dependencies([assert_rank]):
    target_shape = tf.stack([crop_height, crop_width, input_shape[2]])

  assert_size = tf.Assert(
      tf.logical_and(
          tf.greater_equal(input_shape[0], crop_height),
          tf.greater_equal(input_shape[1], crop_width)),
      ['Crop size greater than the image size.'])
  begin = tf.to_int32(tf.stack([offset_height, offset_width, 0]))

  # tf.slice (unlike crop_to_bounding_box) accepts tensors for the crop size.
  with tf.control_dependencies([assert_size]):
    window = tf.slice(image, begin, target_shape)
  return tf.reshape(window, target_shape)
def _random_crop(image_list, crop_height, crop_width):
  """Crops the given list of images.

  The function applies the same crop to each image in the list. This can be
  effectively applied when there are multiple image inputs of the same
  dimension such as:
    image, depths, normals = _random_crop([image, depths, normals], 120, 150)

  Args:
    image_list: a list of image tensors of the same dimension but possibly
      varying channel.
    crop_height: the new height.
    crop_width: the new width.

  Returns:
    the image_list with cropped images.

  Raises:
    ValueError: if there are multiple image inputs provided with different size
      or the images are smaller than the crop dimensions.
  """
  if not image_list:
    raise ValueError('Empty image_list.')

  # Compute the rank assertions. Each image must be rank 3 (H, W, C).
  rank_assertions = []
  for i in range(len(image_list)):
    image_rank = tf.rank(image_list[i])
    rank_assert = tf.Assert(
        tf.equal(image_rank, 3),
        ['Wrong rank for tensor %s [expected] [actual]',
         image_list[i].name, 3, image_rank])
    rank_assertions.append(rank_assert)

  # Read the reference shape from the first image, gated on its rank check.
  with tf.control_dependencies([rank_assertions[0]]):
    image_shape = tf.shape(image_list[0])
  image_height = image_shape[0]
  image_width = image_shape[1]
  crop_size_assert = tf.Assert(
      tf.logical_and(
          tf.greater_equal(image_height, crop_height),
          tf.greater_equal(image_width, crop_width)),
      ['Crop size greater than the image size.'])

  asserts = [rank_assertions[0], crop_size_assert]

  # Every subsequent image must match the first image's height and width.
  for i in range(1, len(image_list)):
    image = image_list[i]
    asserts.append(rank_assertions[i])
    with tf.control_dependencies([rank_assertions[i]]):
      shape = tf.shape(image)
    height = shape[0]
    width = shape[1]

    height_assert = tf.Assert(
        tf.equal(height, image_height),
        ['Wrong height for tensor %s [expected][actual]',
         image.name, height, image_height])
    width_assert = tf.Assert(
        tf.equal(width, image_width),
        ['Wrong width for tensor %s [expected][actual]',
         image.name, width, image_width])
    asserts.extend([height_assert, width_assert])

  # Create a random bounding box.
  #
  # Use tf.random_uniform and not numpy.random.rand as doing the former would
  # generate random numbers at graph eval time, unlike the latter which
  # generates random numbers at graph definition time.
  # The reshape to a scalar is gated on all assertions so no crop is computed
  # from unvalidated shapes.
  with tf.control_dependencies(asserts):
    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
  with tf.control_dependencies(asserts):
    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
  offset_height = tf.random_uniform(
      [], maxval=max_offset_height, dtype=tf.int32)
  offset_width = tf.random_uniform(
      [], maxval=max_offset_width, dtype=tf.int32)

  # Apply the single sampled offset to every image so crops stay aligned.
  return [_crop(image, offset_height, offset_width,
                crop_height, crop_width) for image in image_list]
def _central_crop(image_list, crop_height, crop_width):
  """Performs central crops of the given image list.

  Args:
    image_list: a list of image tensors of the same dimension but possibly
      varying channel.
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    the list of cropped images.
  """
  cropped_images = []
  for img in image_list:
    img_shape = tf.shape(img)
    # Center the crop window inside the image.
    top = (img_shape[0] - crop_height) / 2
    left = (img_shape[1] - crop_width) / 2
    cropped_images.append(_crop(img, top, left, crop_height, crop_width))
  return cropped_images
def _mean_image_subtraction(image, means):
  """Subtracts the given means from each image channel.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown, if `image` has a rank other
      than three or if the number of channels in `image` doesn't match the
      number of values in `means`.
  """
  if image.get_shape().ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
  num_channels = image.get_shape().as_list()[-1]
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

  # Split into per-channel slices, shift each by its mean, and reassemble.
  channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
  centered = [channel - mean for channel, mean in zip(channels, means)]
  return tf.concat(axis=2, values=centered)
def _smallest_size_at_least(height, width, smallest_side):
  """Computes new shape with the smallest side equal to `smallest_side`.

  Computes new shape with the smallest side equal to `smallest_side` while
  preserving the original aspect ratio.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: and int32 scalar tensor indicating the new width.
  """
  side = tf.to_float(tf.convert_to_tensor(smallest_side, dtype=tf.int32))
  height_f = tf.to_float(height)
  width_f = tf.to_float(width)

  # Scale so that the smaller of the two dimensions becomes `smallest_side`.
  scale = tf.cond(tf.greater(height_f, width_f),
                  lambda: side / width_f,
                  lambda: side / height_f)
  new_height = tf.to_int32(tf.rint(height_f * scale))
  new_width = tf.to_int32(tf.rint(width_f * scale))
  return new_height, new_width
def _aspect_preserving_resize(image, smallest_side):
  """Resize images preserving the original aspect ratio.

  Args:
    image: A 3-D image `Tensor`.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
  input_shape = tf.shape(image)
  new_height, new_width = _smallest_size_at_least(
      input_shape[0], input_shape[1], smallest_side)

  # resize_bilinear works on batches, so add and then strip a batch dimension.
  batched = tf.expand_dims(image, 0)
  resized = tf.image.resize_bilinear(batched, [new_height, new_width],
                                     align_corners=False)
  resized = tf.squeeze(resized)
  # Only the channel count is statically known after resizing.
  resized.set_shape([None, None, 3])
  return resized
def preprocess_for_train(image,
                         output_height,
                         output_width,
                         resize_side_min=_RESIZE_SIDE_MIN,
                         resize_side_max=_RESIZE_SIDE_MAX):
  """Preprocesses the given image for training.

  Note that the actual resizing scale is sampled from
  [`resize_size_min`, `resize_size_max`].

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    resize_side_min: The lower bound for the smallest side of the image for
      aspect-preserving resizing.
    resize_side_max: The upper bound for the smallest side of the image for
      aspect-preserving resizing.

  Returns:
    A preprocessed image.
  """
  # Sample a target side length from the inclusive range [min, max].
  sampled_side = tf.random_uniform(
      [], minval=resize_side_min, maxval=resize_side_max + 1, dtype=tf.int32)

  image = _aspect_preserving_resize(image, sampled_side)
  image = _random_crop([image], output_height, output_width)[0]
  image.set_shape([output_height, output_width, 3])
  image = tf.to_float(image)
  image = tf.image.random_flip_left_right(image)
  return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_for_eval(image, output_height, output_width, resize_side):
  """Preprocesses the given image for evaluation.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    resize_side: The smallest side of the image for aspect-preserving resizing.

  Returns:
    A preprocessed image.
  """
  # Deterministic pipeline: fixed resize, centered crop, mean subtraction.
  resized = _aspect_preserving_resize(image, resize_side)
  cropped = _central_crop([resized], output_height, output_width)[0]
  cropped.set_shape([output_height, output_width, 3])
  cropped = tf.to_float(cropped)
  return _mean_image_subtraction(cropped, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_image(image, output_height, output_width, is_training=False,
                     resize_side_min=_RESIZE_SIDE_MIN,
                     resize_side_max=_RESIZE_SIDE_MAX):
  """Preprocesses the given image.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.
    resize_side_min: The lower bound for the smallest side of the image for
      aspect-preserving resizing. If `is_training` is `False`, then this value
      is used for rescaling.
    resize_side_max: The upper bound for the smallest side of the image for
      aspect-preserving resizing. If `is_training` is `False`, this value is
      ignored. Otherwise, the resize side is sampled from
      [resize_size_min, resize_size_max].

  Returns:
    A preprocessed image.
  """
  # Evaluation uses the deterministic pipeline with a fixed resize side.
  if not is_training:
    return preprocess_for_eval(image, output_height, output_width,
                               resize_side_min)
  return preprocess_for_train(image, output_height, output_width,
                              resize_side_min, resize_side_max)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes specifying naming conventions used for object detection.
Specifies:
InputDataFields: standard fields used by reader/preprocessor/batcher.
DetectionResultFields: standard fields returned by object detector.
BoxListFields: standard fields used by BoxList
TfExampleFields: standard fields for tf-example data format (go/tf-example).
"""
class InputDataFields(object):
  """Names for the input tensors.

  Holds the standard data field names to use for identifying input tensors. This
  should be used by the decoder to identify keys for the returned tensor_dict
  containing input tensors. And it should be used by the model to identify the
  tensors it needs.

  Attributes:
    image: image.
    image_additional_channels: additional channels.
    original_image: image in the original input size.
    key: unique key corresponding to image.
    source_id: source of the original image.
    filename: original filename of the dataset (without common path).
    groundtruth_image_classes: image-level class labels.
    groundtruth_boxes: coordinates of the ground truth boxes in the image.
    groundtruth_classes: box-level class labels.
    groundtruth_label_types: box-level label types (e.g. explicit negative).
    groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]
      is the groundtruth a single object or a crowd.
    groundtruth_area: area of a groundtruth segment.
    groundtruth_difficult: is a `difficult` object
    groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the
      same class, forming a connected group, where instances are heavily
      occluding each other.
    proposal_boxes: coordinates of object proposal boxes.
    proposal_objectness: objectness score of each proposal.
    groundtruth_instance_masks: ground truth instance masks.
    groundtruth_instance_boundaries: ground truth instance boundaries.
    groundtruth_instance_classes: instance mask-level class labels.
    groundtruth_keypoints: ground truth keypoints.
    groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
    groundtruth_label_scores: groundtruth label scores.
    groundtruth_weights: groundtruth weight factor for bounding boxes.
    num_groundtruth_boxes: number of groundtruth boxes.
    true_image_shape: true shapes of images in the resized images, as resized
      images can be padded with zeros.
    multiclass_scores: the label score per class for each box.
  """
  # Each attribute below is the canonical string key used in the input
  # tensor_dict produced by decoders and consumed by models.
  image = 'image'
  image_additional_channels = 'image_additional_channels'
  original_image = 'original_image'
  key = 'key'
  source_id = 'source_id'
  filename = 'filename'
  groundtruth_image_classes = 'groundtruth_image_classes'
  groundtruth_boxes = 'groundtruth_boxes'
  groundtruth_classes = 'groundtruth_classes'
  groundtruth_label_types = 'groundtruth_label_types'
  groundtruth_is_crowd = 'groundtruth_is_crowd'
  groundtruth_area = 'groundtruth_area'
  groundtruth_difficult = 'groundtruth_difficult'
  groundtruth_group_of = 'groundtruth_group_of'
  proposal_boxes = 'proposal_boxes'
  proposal_objectness = 'proposal_objectness'
  groundtruth_instance_masks = 'groundtruth_instance_masks'
  groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'
  groundtruth_instance_classes = 'groundtruth_instance_classes'
  groundtruth_keypoints = 'groundtruth_keypoints'
  groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
  groundtruth_label_scores = 'groundtruth_label_scores'
  groundtruth_weights = 'groundtruth_weights'
  num_groundtruth_boxes = 'num_groundtruth_boxes'
  true_image_shape = 'true_image_shape'
  multiclass_scores = 'multiclass_scores'
class DetectionResultFields(object):
  """Naming conventions for storing the output of the detector.

  Attributes:
    source_id: source of the original image.
    key: unique key corresponding to image.
    detection_boxes: coordinates of the detection boxes in the image.
    detection_scores: detection scores for the detection boxes in the image.
    detection_classes: detection-level class labels.
    detection_masks: contains a segmentation mask for each detection box.
    detection_boundaries: contains an object boundary for each detection box.
    detection_keypoints: contains detection keypoints for each detection box.
    num_detections: number of detections in the batch.
  """
  # Canonical string keys for the detector's output tensor dictionary.
  source_id = 'source_id'
  key = 'key'
  detection_boxes = 'detection_boxes'
  detection_scores = 'detection_scores'
  detection_classes = 'detection_classes'
  detection_masks = 'detection_masks'
  detection_boundaries = 'detection_boundaries'
  detection_keypoints = 'detection_keypoints'
  num_detections = 'num_detections'
class BoxListFields(object):
  """Naming conventions for BoxLists.

  Attributes:
    boxes: bounding box coordinates.
    classes: classes per bounding box.
    scores: scores per bounding box.
    weights: sample weights per bounding box.
    objectness: objectness score per bounding box.
    masks: masks per bounding box.
    boundaries: boundaries per bounding box.
    keypoints: keypoints per bounding box.
    keypoint_heatmaps: keypoint heatmaps per bounding box.
    is_crowd: is_crowd annotation per bounding box.
  """
  # Canonical string keys for per-box fields stored in a BoxList.
  boxes = 'boxes'
  classes = 'classes'
  scores = 'scores'
  weights = 'weights'
  objectness = 'objectness'
  masks = 'masks'
  boundaries = 'boundaries'
  keypoints = 'keypoints'
  keypoint_heatmaps = 'keypoint_heatmaps'
  is_crowd = 'is_crowd'
class TfExampleFields(object):
  """TF-example proto feature names for object detection.

  Holds the standard feature names to load from an Example proto for object
  detection.

  Attributes:
    image_encoded: JPEG encoded string
    image_format: image format, e.g. "JPEG"
    filename: filename
    channels: number of channels of image
    colorspace: colorspace, e.g. "RGB"
    height: height of image in pixels, e.g. 462
    width: width of image in pixels, e.g. 581
    source_id: original source of the image
    image_class_text: image-level label in text format
    image_class_label: image-level label in numerical format
    object_class_text: labels in text format, e.g. ["person", "cat"]
    object_class_label: labels in numbers, e.g. [16, 8]
    object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30
    object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40
    object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50
    object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70
    object_view: viewpoint of object, e.g. ["frontal", "left"]
    object_truncated: is object truncated, e.g. [true, false]
    object_occluded: is object occluded, e.g. [true, false]
    object_difficult: is object difficult, e.g. [true, false]
    object_group_of: is object a single object or a group of objects
    object_depiction: is object a depiction
    object_is_crowd: [DEPRECATED, use object_group_of instead]
      is the object a single object or a crowd
    object_segment_area: the area of the segment.
    object_weight: a weight factor for the object's bounding box.
    instance_masks: instance segmentation masks.
    instance_boundaries: instance boundaries.
    instance_classes: Classes for each instance segmentation mask.
    detection_class_label: class label in numbers.
    detection_bbox_ymin: ymin coordinates of a detection box.
    detection_bbox_xmin: xmin coordinates of a detection box.
    detection_bbox_ymax: ymax coordinates of a detection box.
    detection_bbox_xmax: xmax coordinates of a detection box.
    detection_score: detection score for the class label and box.
  """
  # Feature keys follow the 'image/...' namespace convention of tf.Example.
  image_encoded = 'image/encoded'
  image_format = 'image/format'  # format is reserved keyword
  filename = 'image/filename'
  channels = 'image/channels'
  colorspace = 'image/colorspace'
  height = 'image/height'
  width = 'image/width'
  source_id = 'image/source_id'
  image_class_text = 'image/class/text'
  image_class_label = 'image/class/label'
  object_class_text = 'image/object/class/text'
  object_class_label = 'image/object/class/label'
  object_bbox_ymin = 'image/object/bbox/ymin'
  object_bbox_xmin = 'image/object/bbox/xmin'
  object_bbox_ymax = 'image/object/bbox/ymax'
  object_bbox_xmax = 'image/object/bbox/xmax'
  object_view = 'image/object/view'
  object_truncated = 'image/object/truncated'
  object_occluded = 'image/object/occluded'
  object_difficult = 'image/object/difficult'
  object_group_of = 'image/object/group_of'
  object_depiction = 'image/object/depiction'
  object_is_crowd = 'image/object/is_crowd'
  object_segment_area = 'image/object/segment/area'
  object_weight = 'image/object/weight'
  instance_masks = 'image/segmentation/object'
  instance_boundaries = 'image/boundaries/object'
  instance_classes = 'image/segmentation/object/class'
  detection_class_label = 'image/detection/label'
  detection_bbox_ymin = 'image/detection/bbox/ymin'
  detection_bbox_xmin = 'image/detection/bbox/xmin'
  detection_bbox_ymax = 'image/detection/bbox/ymax'
  detection_bbox_xmax = 'image/detection/bbox/xmax'
  detection_score = 'image/detection/score'
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: object_detection/protos/string_int_label_map.proto
import sys
# Py2/Py3 shim emitted by protoc: on Python 3 the serialized descriptor string
# below must be encoded to bytes via latin1; on Python 2 it already is bytes.
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()

# File-level descriptor built from the wire-encoded FileDescriptorProto.
# NOTE(review): generated by protoc from string_int_label_map.proto — do not
# hand-edit the serialized bytes.
DESCRIPTOR = _descriptor.FileDescriptor(
  name='object_detection/protos/string_int_label_map.proto',
  package='object_detection.protos',
  syntax='proto2',
  serialized_pb=_b('\n2object_detection/protos/string_int_label_map.proto\x12\x17object_detection.protos\"G\n\x15StringIntLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"Q\n\x11StringIntLabelMap\x12<\n\x04item\x18\x01 \x03(\x0b\x32..object_detection.protos.StringIntLabelMapItem')
)
# Descriptor for the StringIntLabelMapItem message: an optional string `name`,
# an optional int32 `id`, and an optional string `display_name`.
# NOTE(review): protoc-generated; field numbers/types mirror the .proto source.
_STRINGINTLABELMAPITEM = _descriptor.Descriptor(
  name='StringIntLabelMapItem',
  full_name='object_detection.protos.StringIntLabelMapItem',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='name', full_name='object_detection.protos.StringIntLabelMapItem.name', index=0,
      number=1, type=9, cpp_type=9, label=1,
      has_default_value=False, default_value=_b("").decode('utf-8'),
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
    _descriptor.FieldDescriptor(
      name='id', full_name='object_detection.protos.StringIntLabelMapItem.id', index=1,
      number=2, type=5, cpp_type=1, label=1,
      has_default_value=False, default_value=0,
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
    _descriptor.FieldDescriptor(
      name='display_name', full_name='object_detection.protos.StringIntLabelMapItem.display_name', index=2,
      number=3, type=9, cpp_type=9, label=1,
      has_default_value=False, default_value=_b("").decode('utf-8'),
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  options=None,
  is_extendable=False,
  syntax='proto2',
  extension_ranges=[],
  oneofs=[
  ],
  # Byte offsets of this message within the serialized file descriptor.
  serialized_start=79,
  serialized_end=150,
)
# Descriptor for the StringIntLabelMap message: a repeated `item` field of
# StringIntLabelMapItem (message_type is patched in after both descriptors
# exist, since this one is defined first).
# NOTE(review): protoc-generated.
_STRINGINTLABELMAP = _descriptor.Descriptor(
  name='StringIntLabelMap',
  full_name='object_detection.protos.StringIntLabelMap',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='item', full_name='object_detection.protos.StringIntLabelMap.item', index=0,
      number=1, type=11, cpp_type=10, label=3,
      has_default_value=False, default_value=[],
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  options=None,
  is_extendable=False,
  syntax='proto2',
  extension_ranges=[],
  oneofs=[
  ],
  # Byte offsets of this message within the serialized file descriptor.
  serialized_start=152,
  serialized_end=233,
)
# Resolve the forward reference: StringIntLabelMap.item holds
# StringIntLabelMapItem messages.
_STRINGINTLABELMAP.fields_by_name['item'].message_type = _STRINGINTLABELMAPITEM
# Register both message types on the file descriptor and symbol database so
# they can be looked up by fully-qualified name.
DESCRIPTOR.message_types_by_name['StringIntLabelMapItem'] = _STRINGINTLABELMAPITEM
DESCRIPTOR.message_types_by_name['StringIntLabelMap'] = _STRINGINTLABELMAP
_sym_db.RegisterFileDescriptor(DESCRIPTOR)

# Concrete Python message classes synthesized from the descriptors above.
StringIntLabelMapItem = _reflection.GeneratedProtocolMessageType('StringIntLabelMapItem', (_message.Message,), dict(
  DESCRIPTOR = _STRINGINTLABELMAPITEM,
  __module__ = 'object_detection.protos.string_int_label_map_pb2'
  # @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMapItem)
  ))
_sym_db.RegisterMessage(StringIntLabelMapItem)

StringIntLabelMap = _reflection.GeneratedProtocolMessageType('StringIntLabelMap', (_message.Message,), dict(
  DESCRIPTOR = _STRINGINTLABELMAP,
  __module__ = 'object_detection.protos.string_int_label_map_pb2'
  # @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMap)
  ))
_sym_db.RegisterMessage(StringIntLabelMap)
# @@protoc_insertion_point(module_scope)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A library of tasks.
This interface is intended to implement a wide variety of navigation
tasks. See go/navigation_tasks for a list.
"""
import abc
import collections
import math
import threading
import networkx as nx
import numpy as np
import tensorflow as tf
#from pyglib import logging
#import gin
from envs import task_env
from envs import util as envs_util
# Utility functions.
def _pad_or_clip_array(np_arr, arr_len, is_front_clip=True, output_mask=False):
"""Make np_arr array to have length arr_len.
If the array is shorter than arr_len, then it is padded from the front with
zeros. If it is longer, then it is clipped either from the back or from the
front. Only the first dimension is modified.
Args:
np_arr: numpy array.
arr_len: integer scalar.
is_front_clip: a boolean. If true then clipping is done in the front,
otherwise in the back.
output_mask: If True, outputs a numpy array of rank 1 which represents
a mask of which values have been added (0 - added, 1 - actual output).
Returns:
A numpy array and the size of padding (as a python int32). This size is
negative is the array is clipped.
"""
shape = list(np_arr.shape)
pad_size = arr_len - shape[0]
padded_or_clipped = None
if pad_size < 0:
if is_front_clip:
padded_or_clipped = np_arr[-pad_size:, :]
else:
padded_or_clipped = np_arr[:arr_len, :]
elif pad_size > 0:
padding = np.zeros([pad_size] + shape[1:], dtype=np_arr.dtype)
padded_or_clipped = np.concatenate([np_arr, padding], axis=0)
else:
padded_or_clipped = np_arr
if output_mask:
mask = np.ones((arr_len,), dtype=np.int)
if pad_size > 0:
mask[-pad_size:] = 0
return padded_or_clipped, pad_size, mask
else:
return padded_or_clipped, pad_size
def classification_loss(truth, predicted, weights=None, is_one_hot=True):
  """A cross entropy loss.

  Computes the mean of cross entropy losses for all pairs of true labels and
  predictions. It wraps around a tf implementation of the cross entropy loss
  with additional reformatting of the inputs. If the truth and predicted are
  n-rank Tensors with n > 2, then these are reshaped to 2-rank Tensors. It
  allows for truth to be specified as a one hot vector or class indices.
  Finally, a weight can be specified for each element in truth and predicted.

  Args:
    truth: an n-rank or (n-1)-rank Tensor containing labels. If is_one_hot is
      True, then n-rank Tensor is expected, otherwise (n-1) rank one.
    predicted: an n-rank float Tensor containing prediction probabilities.
    weights: an (n-1)-rank float Tensor of weights
    is_one_hot: a boolean.

  Returns:
    A TF float scalar.
  """
  num_labels = predicted.get_shape().as_list()[-1]
  # Flatten both tensors to rank 2: [num_examples, num_labels].
  flat_predictions = tf.reshape(predicted, [-1, num_labels])
  if is_one_hot:
    flat_truth = tf.reshape(truth, [-1, num_labels])
  else:
    # Convert class indices into one-hot rows before computing the loss.
    flat_truth = tf.one_hot(
        tf.reshape(truth, [-1]),
        depth=num_labels,
        on_value=1.0,
        off_value=0.0,
        axis=-1)
  losses = tf.nn.softmax_cross_entropy_with_logits(
      labels=flat_truth, logits=flat_predictions)
  if weights is not None:
    # Keep only the losses whose weight is truthy.
    keep = tf.cast(tf.reshape(weights, [-1]), dtype=tf.bool)
    losses = tf.boolean_mask(losses, keep)
  return tf.reduce_mean(losses)
class UnrolledTaskIOConfig(object):
  """Configuration of task inputs and outputs.

  A task can have multiple inputs, which define the context, and a task query
  which defines what is to be executed in this context. The desired execution
  is encoded in an output. The config defines the shapes of the inputs, the
  query and the outputs.
  """

  def __init__(self, inputs, output, query=None):
    """Constructs a Task input/output config.

    Args:
      inputs: an OrderedDict mapping modality type to a tuple. Each tuple
        represents the configuration of an input, with first element being the
        type (a tf.DType) and the second element the shape (a list).
      output: a tuple representing the configuration of the output.
      query: a tuple representing the configuration of the query. If no query,
        then None.

    Raises:
      ValueError: if any configuration tuple is malformed.
    """
    # A configuration of a single input, output or query. Consists of the
    # type, which can be one of the three specified above, and a shape. The
    # shape must be consistent with the type, e.g. if type == 'image', then
    # shape is a 3 valued list.
    io_config = collections.namedtuple('IOConfig', ['type', 'shape'])

    def assert_config(config):
      """Raises ValueError unless config is a (tf.DType, list) tuple."""
      if not isinstance(config, tuple):
        raise ValueError('config must be a tuple. Received {}'.format(
            type(config)))
      if len(config) != 2:
        raise ValueError('config must have 2 elements, has %d' % len(config))
      if not isinstance(config[0], tf.DType):
        raise ValueError('First element of config must be a tf.DType.')
      if not isinstance(config[1], list):
        raise ValueError('Second element of config must be a list.')

    assert isinstance(inputs, collections.OrderedDict)
    for modality_type in inputs:
      assert_config(inputs[modality_type])
    # .items() (not the Python 2-only .iteritems()) works on both Python 2
    # and Python 3.
    self._inputs = collections.OrderedDict(
        [(k, io_config(*value)) for k, value in inputs.items()])

    if query is not None:
      assert_config(query)
      self._query = io_config(*query)
    else:
      self._query = None

    assert_config(output)
    self._output = io_config(*output)

  @property
  def inputs(self):
    return self._inputs

  @property
  def output(self):
    return self._output

  @property
  def query(self):
    return self._query
class UnrolledTask(object):
  """An interface for a Task which can be unrolled during training.

  Each example is called an episode and consists of inputs and target output,
  where the output can be considered as the desired unrolled sequence of
  actions for the inputs. For the specified tasks, these action sequences are
  to be unambiguously definable.
  """
  __metaclass__ = abc.ABCMeta

  def __init__(self, config):
    """Initializes the task.

    Args:
      config: an UnrolledTaskIOConfig describing the input/query/output
        types and shapes of this task.
    """
    assert isinstance(config, UnrolledTaskIOConfig)
    self._config = config
    # A dict of bookkeeping variables.
    self.info = {}
    # Tensorflow input is multithreaded and this lock is needed to prevent
    # race condition in the environment. Without the lock, non-thread safe
    # environments crash.
    self._lock = threading.Lock()

  @property
  def config(self):
    return self._config

  @abc.abstractmethod
  def episode(self):
    """Returns data needed to train and test a single episode.

    Each episode consists of inputs, which define the context of the task, a
    query which defines the task, and a target output, which defines a
    sequence of actions to be executed for this query. This sequence should
    not require feedback, i.e. can be predicted purely from input and query.

    Returns:
      inputs, query, output, where inputs is a list of numpy arrays and query
      and output are numpy arrays. These arrays must be of shape and type as
      specified in the task configuration.
    """
    pass

  def reset(self, observation):
    """Called after the environment is reset."""
    pass

  def episode_batch(self, batch_size):
    """Returns a batch of episodes.

    Args:
      batch_size: size of batch.

    Returns:
      (inputs, query, output, masks) where inputs is list of numpy arrays and
      query, output, and mask are numpy arrays. These arrays must be of shape
      and type as specified in the task configuration with one additional
      preceding dimension corresponding to the batch.

    Raises:
      ValueError: if self.episode() returns illegal values.
    """
    batched_inputs = collections.OrderedDict(
        [[mtype, []] for mtype in self.config.inputs])
    batched_queries = []
    batched_outputs = []
    batched_masks = []
    for _ in range(int(batch_size)):
      # The episode function needs to be thread-safe. Since the current
      # implementations of the envs are not thread safe we need to lock the
      # episode generation here.
      with self._lock:
        inputs, query, outputs = self.episode()
      if not isinstance(outputs, tuple):
        raise ValueError('Outputs return value must be tuple.')
      if len(outputs) != 2:
        raise ValueError('Output tuple must be of size 2.')
      if inputs is not None:
        for modality_type in batched_inputs:
          batched_inputs[modality_type].append(
              np.expand_dims(inputs[modality_type], axis=0))
      if query is not None:
        batched_queries.append(np.expand_dims(query, axis=0))
      batched_outputs.append(np.expand_dims(outputs[0], axis=0))
      if outputs[1] is not None:
        batched_masks.append(np.expand_dims(outputs[1], axis=0))
    # .items() (not the Python 2-only .iteritems()) works on both Python 2
    # and Python 3.
    batched_inputs = {
        k: np.concatenate(i, axis=0) for k, i in batched_inputs.items()
    }
    if batched_queries:
      batched_queries = np.concatenate(batched_queries, axis=0)
    batched_outputs = np.concatenate(batched_outputs, axis=0)
    if batched_masks:
      batched_masks = np.concatenate(batched_masks, axis=0).astype(np.float32)
    else:
      # When the array is empty, the default np.dtype is float64 which causes
      # py_func to crash in the tests.
      batched_masks = np.array([], dtype=np.float32)
    batched_inputs = [batched_inputs[k] for k in self._config.inputs]
    return batched_inputs, batched_queries, batched_outputs, batched_masks

  def tf_episode_batch(self, batch_size):
    """A batch of episodes as TF Tensors.

    Same as episode_batch with the difference that the return values are TF
    Tensors.

    Args:
      batch_size: a python float for the batch size.

    Returns:
      inputs, query, output, mask where inputs is a dictionary of tf.Tensor
      where the keys are the modality types specified in the config.inputs.
      query, output, and mask are TF Tensors. These tensors must
      be of shape and type as specified in the task configuration with one
      additional preceding dimension corresponding to the batch. Both mask and
      output have the same shape as output.
    """
    # Define TF output types and shapes: one entry per input modality,
    # optionally followed by the query, then the output and its mask.
    touts = []
    shapes = []
    for _, i in self._config.inputs.items():
      touts.append(i.type)
      shapes.append(i.shape)
    if self._config.query is not None:
      touts.append(self._config.query.type)
      shapes.append(self._config.query.shape)
    # Shapes and types for batched_outputs.
    touts.append(self._config.output.type)
    shapes.append(self._config.output.shape)
    # Shapes and types for batched_masks.
    touts.append(self._config.output.type)
    shapes.append(self._config.output.shape[0:1])

    def episode_batch_func():
      if self.config.query is None:
        inp, _, output, masks = self.episode_batch(int(batch_size))
        return tuple(inp) + (output, masks)
      else:
        inp, query, output, masks = self.episode_batch(int(batch_size))
        return tuple(inp) + (query, output, masks)

    tf_episode_batch = tf.py_func(episode_batch_func, [], touts,
                                  stateful=True, name='taskdata')
    # py_func loses static shape information; restore it with the configured
    # shapes plus the leading batch dimension.
    for episode, shape in zip(tf_episode_batch, shapes):
      episode.set_shape([batch_size] + shape)

    tf_episode_batch_dict = collections.OrderedDict([
        (mtype, episode)
        for mtype, episode in zip(self.config.inputs.keys(), tf_episode_batch)
    ])
    cur_index = len(self.config.inputs.keys())
    tf_query = None
    if self.config.query is not None:
      tf_query = tf_episode_batch[cur_index]
      cur_index += 1
    tf_outputs = tf_episode_batch[cur_index]
    tf_masks = tf_episode_batch[cur_index + 1]

    return tf_episode_batch_dict, tf_query, tf_outputs, tf_masks

  @abc.abstractmethod
  def target_loss(self, true_targets, targets, weights=None):
    """A loss for training a task model.

    This loss measures the discrepancy between the task outputs, the true and
    predicted ones.

    Args:
      true_targets: tf.Tensor of shape and type as defined in the task config
        containing the true outputs.
      targets: tf.Tensor of shape and type as defined in the task config
        containing the predicted outputs.
      weights: a bool tf.Tensor of shape as targets. Only true values are
        considered when formulating the loss.
    """
    pass

  def reward(self, obs, done, info):
    """Returns a reward.

    The task has to compute a reward based on the state of the environment.
    The reward computation, though, is task specific. The task is to use the
    environment interface, as defined in task_env.py, to compute the reward.
    If this interface does not expose enough information, it is to be updated.

    Args:
      obs: Observation from environment's step function.
      done: Done flag from environment's step function.
      info: Info dict from environment's step function.

    Returns:
      obs: Observation.
      reward: Floating point value.
      done: Done flag.
      info: Info dict.
    """
    # Default implementation does not do anything.
    return obs, 0.0, done, info
class RandomExplorationBasedTask(UnrolledTask):
  """A Task which starts with a random exploration of the environment."""

  def __init__(self,
               env,
               seed,
               add_query_noise=False,
               query_noise_var=0.0,
               *args,
               **kwargs):  # pylint: disable=keyword-arg-before-vararg
    """Initializes a Task using random exploration runs.

    Args:
      env: an instance of type TaskEnv and gym.Env.
      seed: a random seed.
      add_query_noise: boolean, if True then whatever queries are generated,
        they are randomly perturbed. The semantics of the queries depends on
        the concrete task implementation.
      query_noise_var: float, the variance of Gaussian noise used for query
        perturbation. Used iff add_query_noise==True.
      *args: see super class.
      **kwargs: see super class.
    """
    super(RandomExplorationBasedTask, self).__init__(*args, **kwargs)
    assert isinstance(env, task_env.TaskEnv)
    self._env = env
    self._env.set_task(self)
    self._rng = np.random.RandomState(seed)
    self._add_query_noise = add_query_noise
    self._query_noise_var = query_noise_var
    # GoToStaticXTask can also take empty config but for the rest of the
    # classes the number of modality types is 1.
    if len(self.config.inputs.keys()) > 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type or less.')

  def _exploration(self):
    """Generates a random exploration run.

    The function uses the environment to generate a run.

    Returns:
      A dict keyed by modality type of numpy arrays, each of type and shape
      as specified in config.inputs.
      A list of states along the exploration path.
      A list of vertex indices corresponding to the path of the exploration.
    """
    # list(...) around .values() keeps this working on Python 3, where dict
    # views are not indexable (the original relied on Python 2 lists).
    in_seq_len = list(self._config.inputs.values())[0].shape[0]
    path, _, states, step_outputs = self._env.random_step_sequence(
        min_len=in_seq_len)
    obs = {modality_type: [] for modality_type in self._config.inputs}
    for o in step_outputs:
      step_obs, _, done, _ = o
      # It is expected that each value of step_obs is a dict of observations,
      # whose dimensions are consistent with the config.inputs sizes.
      for modality_type in self._config.inputs:
        assert modality_type in step_obs, '{}'.format(type(step_obs))
        o = step_obs[modality_type]
        i = self._config.inputs[modality_type]
        assert len(o.shape) == len(i.shape) - 1
        for dim_o, dim_i in zip(o.shape, i.shape[1:]):
          assert dim_o == dim_i, '{} != {}'.format(dim_o, dim_i)
        obs[modality_type].append(o)
      if done:
        break

    if not obs:
      return obs, states, path

    # Clip the path proportionally to the number of kept observations.
    max_path_len = int(
        round(in_seq_len * float(len(path)) /
              float(len(list(obs.values())[0]))))
    path = path[-max_path_len:]
    states = states[-in_seq_len:]

    # The above obs is a dict of lists of np.array. Re-format it as a dict of
    # np.array, each array containing all observations from all steps.
    def regroup(obs, i):
      """Regroups observations.

      Args:
        obs: a dict of lists of observations keyed by modality type. The k-th
          element of each list is the observation from the k-th step.
        i: the modality type whose observations are to be grouped.

      Returns:
        A numpy array of shape config.inputs[i] which contains all i-th
        observations from all steps. These are concatenated along the first
        dimension. In addition, if the number of observations is different
        from the one specified in config.inputs[i].shape[0], then the array is
        either padded from front or clipped.
      """
      grouped_obs = np.concatenate(
          [np.expand_dims(o, axis=0) for o in obs[i]], axis=0)
      in_seq_len = self._config.inputs[i].shape[0]
      # pylint: disable=unbalanced-tuple-unpacking
      grouped_obs, _ = _pad_or_clip_array(
          grouped_obs, in_seq_len, is_front_clip=True)
      return grouped_obs

    all_obs = {i: regroup(obs, i) for i in self._config.inputs}

    return all_obs, states, path

  def _obs_to_state(self, path, states):
    """Computes mapping between path nodes and states.

    Args:
      path: a list of graph vertex indices.
      states: a list of states (x, y, ...) along the smoothed trajectory.

    Returns:
      path_to_obs: dict mapping a path vertex index to the list of observation
        indices closest (in Euclidean distance) to that vertex.
      obs_to_state: list mapping each observation index to its closest path
        vertex index.
    """
    # Generate a numpy array of locations corresponding to the path vertices.
    # A list comprehension (not the Python 2 list-returning map) keeps this
    # working on Python 3.
    path_coordinates = [self._env.vertex_to_pose(v) for v in path]
    path_coordinates = np.concatenate(
        [np.reshape(p, [1, 2]) for p in path_coordinates])
    # The observations are taken along a smoothed trajectory following the
    # path. We compute a mapping between the observations and the map
    # vertices.
    path_to_obs = collections.defaultdict(list)
    obs_to_state = []
    for i, s in enumerate(states):
      location = np.reshape(s[0:2], [1, 2])
      # Index of the path vertex closest to this state.
      index = np.argmin(
          np.reshape(
              np.sum(np.power(path_coordinates - location, 2), axis=1), [-1]))
      index = path[index]
      path_to_obs[index].append(i)
      obs_to_state.append(index)
    return path_to_obs, obs_to_state

  def _perturb_state(self, state, noise_var):
    """Perturbs the state.

    The locations are perturbed using truncated Gaussian noise (NOTE: the
    noise_var argument is passed to RandomState.normal as the standard
    deviation, not the variance). The orientation is uniformly sampled.

    Args:
      state: a numpy array containing an env state (x, y locations).
      noise_var: float.

    Returns:
      The perturbed state.
    """

    def normal(v, std):
      # Truncated normal perturbation, limited to two standard deviations.
      if std > 0:
        n = self._rng.normal(0.0, std)
        n = min(n, 2.0 * std)
        n = max(n, -2.0 * std)
        return v + n
      else:
        return v

    state = state.copy()
    state[0] = normal(state[0], noise_var)
    state[1] = normal(state[1], noise_var)
    if state.size > 2:
      state[2] = self._rng.uniform(-math.pi, math.pi)
    return state

  def _sample_obs(self,
                  indices,
                  observations,
                  observation_states,
                  path_to_obs,
                  max_obs_index=None,
                  use_exploration_obs=True):
    """Samples one observation which corresponds to vertex_index in path.

    In addition, the sampled observation must have index in observations less
    than max_obs_index. If these two conditions cannot be satisfied the
    function returns None.

    Args:
      indices: a list of integers.
      observations: a list of numpy arrays containing all the observations.
      observation_states: a list of numpy arrays, each array representing the
        state of the observation.
      path_to_obs: a dict of path indices to lists of observation indices.
      max_obs_index: an integer.
      use_exploration_obs: if True, then the observation is sampled among the
        specified observations, otherwise it is obtained from the environment.

    Returns:
      A tuple of:
      -- A numpy array of size width x height x 3 representing the sampled
         observation.
      -- The index of the sampled observation among the input observations.
      -- The state at which the observation is captured.
    Raises:
      ValueError: if the observation and observation_states lists are of
        different lengths.
    """
    if len(observations) != len(observation_states):
      raise ValueError('observation and observation_states lists must have '
                       'equal lengths')
    if not indices:
      return None, None, None
    vertex_index = self._rng.choice(indices)
    if use_exploration_obs:
      obs_indices = path_to_obs[vertex_index]

      if max_obs_index is not None:
        obs_indices = [i for i in obs_indices if i < max_obs_index]

      if obs_indices:
        index = self._rng.choice(obs_indices)
        if self._add_query_noise:
          xytheta = self._perturb_state(observation_states[index],
                                        self._query_noise_var)
          return self._env.observation(xytheta), index, xytheta
        else:
          return observations[index], index, observation_states[index]
      else:
        return None, None, None
    else:
      xy = self._env.vertex_to_pose(vertex_index)
      xytheta = np.array([xy[0], xy[1], 0.0])
      xytheta = self._perturb_state(xytheta, self._query_noise_var)
      return self._env.observation(xytheta), None, xytheta
class AreNearbyTask(RandomExplorationBasedTask):
  """A task of identifying whether a query is nearby current location or not.

  The query is guaranteed to be in proximity of an already visited location,
  i.e. close to one of the observations. For each observation we have one
  query, which is either close or not to this observation.
  """

  def __init__(
      self,
      max_distance=0,
      *args,
      **kwargs):  # pylint: disable=keyword-arg-before-vararg
    super(AreNearbyTask, self).__init__(*args, **kwargs)
    self._max_distance = max_distance

    if len(self.config.inputs.keys()) != 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type')

  def episode(self):
    """Episode data.

    Returns:
      observations: a dict with one entry. This entry is a numpy array of
        size in_seq_len x observation_size x observation_size x 3 containing
        in_seq_len images.
      query: a numpy array of size
        in_seq_len x observation_size X observation_size x 3 containing query
        images.
      A tuple of size two. First element is a in_seq_len x 3 numpy array of
        one hot vectors. The i-th row denotes whether the i-th query image is
        nearby visited (label 1), not nearby visited (label 0), or not
        visited (label 2) w.r.t. the i-th observation.
      The second element in the tuple is a mask, a numpy array of size
        in_seq_len x 1 and values 1.0 or 0.0 denoting whether the query is
        valid or not (it can happen that the query is not valid, e.g. there
        are not enough observations to have meaningful queries).
    """
    observations, states, path = self._exploration()
    all_obs = list(observations.values())[0]
    assert len(all_obs) == len(states)

    # The observations are taken along a smoothed trajectory following the
    # path. We compute a mapping between the observations and the map
    # vertices.
    path_to_obs, obs_to_path = self._obs_to_state(path, states)

    # Go over all observations, and sample a query for each. The label of the
    # query is determined by which vertex group it is drawn from.
    g = self._env.graph
    queries = []
    labels = []
    validity_masks = []
    query_index_in_observations = []
    for i, curr_o in enumerate(all_obs):
      p = obs_to_path[i]
      low = max(0, i - self._max_distance)

      # Nearby visited indices, label 1. (The original code filtered the
      # still-empty placeholder index_groups[1] here, which made this group
      # always empty -- fixed to filter the list just built.)
      nearby_visited = [
          ii for ii in path[low:i + 1] + list(g[p]) if ii in obs_to_path[:i]
      ]
      nearby_visited = [ii for ii in nearby_visited if ii in path_to_obs]
      # NOT nearby visited indices, label 0. (Same placeholder bug fixed.)
      not_nearby_visited = [ii for ii in path[:low] if ii not in g[p]]
      not_nearby_visited = [
          ii for ii in not_nearby_visited if ii in path_to_obs
      ]
      # NOT visited indices, label 2.
      not_visited = [
          ii for ii in range(g.number_of_nodes()) if ii not in path[:i + 1]
      ]
      # A list of lists of vertex indices. Each list in this group corresponds
      # to one possible label.
      index_groups = [not_nearby_visited, nearby_visited, not_visited]

      # Consider only labels for which there are indices.
      allowed_labels = [ii for ii, group in enumerate(index_groups) if group]
      label = self._rng.choice(allowed_labels)

      indices = list(set(index_groups[label]))
      max_obs_index = None if label == 2 else i
      use_exploration_obs = label != 2
      o, obs_index, _ = self._sample_obs(
          indices=indices,
          observations=all_obs,
          observation_states=states,
          path_to_obs=path_to_obs,
          max_obs_index=max_obs_index,
          use_exploration_obs=use_exploration_obs)
      query_index_in_observations.append(obs_index)

      # If we cannot sample a valid query, we mark it as not valid in mask.
      if o is None:
        label = 0.0
        o = curr_o
        validity_masks.append(0)
      else:
        validity_masks.append(1)

      # NOTE(review): self._env.observation(...) appears to return a dict of
      # modalities (see GotoStaticXTask.episode), while exploration
      # observations are raw arrays; the original unconditional
      # `o.values()[0]` crashed on the array case. Normalize both here.
      if isinstance(o, dict):
        o = list(o.values())[0]
      queries.append(o)
      labels.append(label)

    query = np.concatenate([np.expand_dims(q, axis=0) for q in queries], axis=0)

    def one_hot(label, num_labels=3):
      # `float` instead of the removed NumPy alias np.float (same dtype).
      a = np.zeros((num_labels,), dtype=float)
      a[int(label)] = 1.0
      return a

    outputs = np.stack([one_hot(l) for l in labels], axis=0)
    validity_mask = np.reshape(
        np.array(validity_masks, dtype=np.int32), [-1, 1])

    self.info['query_index_in_observations'] = query_index_in_observations
    self.info['observation_states'] = states

    return observations, query, (outputs, validity_mask)

  def target_loss(self, truth, predicted, weights=None):
    pass
class NeighboringQueriesTask(RandomExplorationBasedTask):
  """A task of identifying whether two queries are closeby or not.

  The proximity between queries is defined by the length of the shortest path
  between them.
  """

  def __init__(
      self,
      max_distance=1,
      *args,
      **kwargs):  # pylint: disable=keyword-arg-before-vararg
    """Initializes a NeighboringQueriesTask.

    Args:
      max_distance: integer, the maximum distance in terms of number of
        vertices between the two queries, so that they are considered
        neighboring.
      *args: for super class.
      **kwargs: for super class.
    """
    super(NeighboringQueriesTask, self).__init__(*args, **kwargs)
    self._max_distance = max_distance
    if len(self.config.inputs.keys()) != 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type')

  def episode(self):
    """Episode data.

    Returns:
      observations: a dict with one entry. This entry is a numpy array of
        size in_seq_len x observation_size x observation_size x 3 containing
        in_seq_len images.
      query: a numpy array of size
        2 x observation_size X observation_size x 3 containing a pair of query
        images.
      A tuple of size two. First element is a numpy array of size 2 containing
        a one hot vector of whether the two observations are neighboring.
        Second element is a boolean numpy value denoting whether this is a
        valid episode.
    """
    observations, states, path = self._exploration()
    # list(...) around .values() keeps this working on Python 3.
    all_obs = list(observations.values())[0]
    assert len(all_obs) == len(states)
    path_to_obs, _ = self._obs_to_state(path, states)
    # Restrict path to ones for which observations have been generated.
    path = [p for p in path if p in path_to_obs]
    # Sample first query.
    query1_index = self._rng.choice(path)
    # Sample label.
    label = self._rng.randint(2)
    # Sample second query.
    # If label == 1, then second query must be nearby, otherwise not.
    closest_indices = list(nx.single_source_shortest_path(
        self._env.graph, query1_index, self._max_distance).keys())
    if label == 0:
      # Path vertices which are NOT within max_distance of the first query.
      indices = [p for p in path if p not in closest_indices]
    else:
      # Path vertices which ARE within max_distance of the first query.
      indices = [p for p in closest_indices if p in path]
    # NOTE(review): if `indices` is empty (e.g. the whole path is within
    # max_distance and label == 0), self._rng.choice raises -- the original
    # had the same behavior; is_valid is never set to 0. Confirm intended.
    query2_index = self._rng.choice(indices)
    # Generate the observations for both queries.
    query1, query1_index, _ = self._sample_obs(
        [query1_index],
        all_obs,
        states,
        path_to_obs,
        max_obs_index=None,
        use_exploration_obs=True)
    query2, query2_index, _ = self._sample_obs(
        [query2_index],
        all_obs,
        states,
        path_to_obs,
        max_obs_index=None,
        use_exploration_obs=True)

    queries = np.concatenate(
        [np.expand_dims(q, axis=0) for q in [query1, query2]])
    labels = np.array([0, 0])
    labels[label] = 1
    is_valid = np.array([1])

    self.info['observation_states'] = states
    self.info['query_indices_in_observations'] = [query1_index, query2_index]

    return observations, queries, (labels, is_valid)

  def target_loss(self, truth, predicted, weights=None):
    pass
#@gin.configurable
class GotoStaticXTask(RandomExplorationBasedTask):
  """Task go to a static X.

  If continuous reward is used only one goal is allowed so that the reward can
  be computed as a delta-distance to that goal.
  """

  def __init__(self,
               step_reward=0.0,
               goal_reward=1.0,
               hit_wall_reward=-1.0,
               done_at_target=False,
               use_continuous_reward=False,
               *args,
               **kwargs):  # pylint: disable=keyword-arg-before-vararg
    super(GotoStaticXTask, self).__init__(*args, **kwargs)
    if len(self.config.inputs.keys()) > 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type or less.')

    self._step_reward = step_reward
    self._goal_reward = goal_reward
    self._hit_wall_reward = hit_wall_reward
    self._done_at_target = done_at_target
    self._use_continuous_reward = use_continuous_reward

    # Previous shortest-path length to the goal; set in reset() and updated
    # in reward() when continuous reward is used.
    self._previous_path_length = None

  def episode(self):
    """Generates one episode: exploration inputs, goal query, action deltas."""
    observations, _, path = self._exploration()
    if len(path) < 2:
      raise ValueError('The exploration path has only one node.')

    g = self._env.graph
    start = path[-1]
    # Sample a goal among the earlier path vertices distinct from the start.
    # Equivalent in distribution to the original rejection loop, but cannot
    # hang when every candidate equals the start.
    candidates = [p for p in path[:-1] if p != start]
    if not candidates:
      raise ValueError('The exploration path contains no valid goal vertex.')
    goal = self._rng.choice(candidates)

    goal_path = nx.shortest_path(g, start, goal)

    init_orientation = self._rng.uniform(0, np.pi, (1,))
    trajectory = np.array(
        [list(self._env.vertex_to_pose(p)) for p in goal_path])
    init_xy = np.reshape(trajectory[0, :], [-1])
    init_state = np.concatenate([init_xy, init_orientation], 0)

    trajectory = trajectory[1:, :]
    deltas = envs_util.trajectory_to_deltas(trajectory, init_state)
    output_seq_len = self._config.output.shape[0]
    arr = _pad_or_clip_array(deltas, output_seq_len, output_mask=True)
    # pylint: disable=unbalanced-tuple-unpacking
    thetas, _, thetas_mask = arr

    # list(...) around .values() keeps this working on Python 3.
    query = list(
        self._env.observation(self._env.vertex_to_pose(goal)).values())[0]
    return observations, query, (thetas, thetas_mask)

  def reward(self, obs, done, info):
    """Computes the task reward for a single environment step."""
    if 'wall_collision' in info and info['wall_collision']:
      return obs, self._hit_wall_reward, done, info

    reward = 0.0
    current_vertex = self._env.pose_to_vertex(self._env.state)

    if current_vertex in self._env.targets():
      if self._done_at_target:
        done = True
      else:
        obs = self._env.reset()
      reward = self._goal_reward
    else:
      if self._use_continuous_reward:
        if len(self._env.targets()) != 1:
          raise ValueError(
              'FindX task with continuous reward is assuming only one target.')
        goal_vertex = self._env.targets()[0]
        # Reward is the decrease in shortest-path distance to the goal.
        path_length = self._compute_path_length(goal_vertex)
        reward = self._previous_path_length - path_length
        self._previous_path_length = path_length
      else:
        reward = self._step_reward

    return obs, reward, done, info

  def _compute_path_length(self, goal_vertex):
    """Returns the metric length of the shortest path to goal_vertex."""
    current_vertex = self._env.pose_to_vertex(self._env.state)
    path = nx.shortest_path(self._env.graph, current_vertex, goal_vertex)
    assert len(path) >= 2
    # The first hop is measured exactly from the agent's current location;
    # remaining hops are counted in grid cells.
    curr_xy = np.array(self._env.state[:2])
    next_xy = np.array(self._env.vertex_to_pose(path[1]))
    last_step_distance = np.linalg.norm(next_xy - curr_xy)
    return (len(path) - 2) * self._env.cell_size_px + last_step_distance

  def reset(self, observation):
    if self._use_continuous_reward:
      if len(self._env.targets()) != 1:
        raise ValueError(
            'FindX task with continuous reward is assuming only one target.')
      goal_vertex = self._env.targets()[0]
      self._previous_path_length = self._compute_path_length(goal_vertex)

  def target_loss(self, truth, predicted, weights=None):
    """Action classification loss.

    Args:
      truth: a batch_size x sequence length x number of labels float
        Tensor containing a one hot vector for each label in each batch and
        time.
      predicted: a batch_size x sequence length x number of labels float
        Tensor containing a predicted distribution over all actions.
      weights: a batch_size x sequence_length float Tensor of bool
        denoting which actions are valid.

    Returns:
      An average cross entropy over all batches and elements in sequence.
    """
    return classification_loss(
        truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class RelativeLocationTask(RandomExplorationBasedTask):
  """A task of estimating the relative location of a query w.r.t current.

  It is to be used for debugging. It is designed such that the output is a
  single value, out of a discrete set of values, so that it can be phrased as
  a classification problem.
  """

  def __init__(self, num_labels, *args, **kwargs):
    """Initializes a relative location task.

    Args:
      num_labels: integer, number of orientations to bin the relative
        orientation into.
      *args: see super class.
      **kwargs: see super class.
    """
    super(RelativeLocationTask, self).__init__(*args, **kwargs)
    self._num_labels = num_labels
    if len(self.config.inputs.keys()) != 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type')

  def episode(self):
    """Samples a past query and bins its orientation relative to the agent."""
    observations, states, path = self._exploration()

    # Select a random element from history.
    path_to_obs, _ = self._obs_to_state(path, states)
    use_exploration_obs = not self._add_query_noise
    # list(...) around .values() keeps this working on Python 3.
    query, _, query_state = self._sample_obs(
        path[:-1],
        list(observations.values())[0],
        states,
        path_to_obs,
        max_obs_index=None,
        use_exploration_obs=use_exploration_obs)

    x, y, theta = tuple(states[-1])
    q_x, q_y, _ = tuple(query_state)
    t_x, t_y = q_x - x, q_y - y
    # Rotate the translation vector into the agent-centric frame.
    (rt_x, rt_y) = (np.sin(theta) * t_x - np.cos(theta) * t_y,
                    np.cos(theta) * t_x + np.sin(theta) * t_y)
    # Bins are [a(i), a(i+1)] for a(i) = -pi + 0.5 * bin_size + i * bin_size.
    shift = np.pi * (1 - 1.0 / (2.0 * self._num_labels))
    orientation = np.arctan2(rt_y, rt_x) + shift
    if orientation < 0:
      orientation += 2 * np.pi
    label = int(np.floor(self._num_labels * orientation / (2 * np.pi)))

    out_shape = self._config.output.shape
    if len(out_shape) != 1:
      raise ValueError('Output shape should be of rank 1.')
    if out_shape[0] != self._num_labels:
      raise ValueError('Output shape must be of size %d' % self._num_labels)
    output = np.zeros(out_shape, dtype=np.float32)
    output[label] = 1

    return observations, query, (output, None)

  def target_loss(self, truth, predicted, weights=None):
    return classification_loss(
        truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class LocationClassificationTask(UnrolledTask):
  """A task of classifying a location as one of several classes.

  The task does not have an input, but just a query and an output. The query
  is an observation of the current location, e.g. an image taken from the
  current state. The output is a label classifying this location in one of
  predefined set of locations (or landmarks).
  The current implementation classifies locations as intersections based on the
  number and directions of biforcations. It is expected that a location can have
  at most 4 different directions, aligned with the axes. As each of these four
  directions might be present or not, the number of possible intersections are
  2^4 = 16.
  """

  def __init__(self, env, seed, *args, **kwargs):
    """Constructor.

    Args:
      env: environment object; must expose graph, vertex_to_pose,
        pose_to_vertex and observation (used below in episode()).
      seed: integer seed for the task-local numpy RandomState.
      *args: forwarded to UnrolledTask.__init__.
      **kwargs: forwarded to UnrolledTask.__init__.

    Raises:
      NotImplementedError: if more than one input modality is configured.
    """
    super(LocationClassificationTask, self).__init__(*args, **kwargs)
    self._env = env
    self._rng = np.random.RandomState(seed)
    # A location property which can be set. If not set, a random one is
    # generated.
    self._location = None
    # NOTE(review): this uses self.config while every other method uses
    # self._config -- presumably UnrolledTask exposes both; confirm.
    if len(self.config.inputs.keys()) > 1:
      raise NotImplementedError('current implementation supports input '
                                'with only one modality type or less.')

  @property
  def location(self):
    # Externally-set location override; None means episode() samples one.
    return self._location

  @location.setter
  def location(self, location):
    self._location = location

  def episode(self):
    """Builds one episode classifying the (sampled or preset) location.

    Returns:
      ([], query, (one_hot_label, None)) where query is the observation of
      the location for the single configured modality and one_hot_label is a
      16-way one-hot encoding of the intersection type at that location.

    Raises:
      ValueError: if the configured output shape is not rank 1 of size 16.
    """
    # Get a location. If not set, sample on at a vertex with a random
    # orientation
    location = self._location
    if location is None:
      num_nodes = self._env.graph.number_of_nodes()
      vertex = int(math.floor(self._rng.uniform(0, num_nodes)))
      xy = self._env.vertex_to_pose(vertex)
      theta = self._rng.uniform(0, 2 * math.pi)
      location = np.concatenate(
          [np.reshape(xy, [-1]), np.array([theta])], axis=0)
    else:
      vertex = self._env.pose_to_vertex(location)
      theta = location[2]

    neighbors = self._env.graph.neighbors(vertex)
    xy_s = [self._env.vertex_to_pose(n) for n in neighbors]

    def rotate(xy, theta):
      """Rotates a vector around the origin by angle theta.

      Args:
        xy: a numpy darray of shape (2, ) of floats containing the x and y
          coordinates of a vector.
        theta: a python float containing the rotation angle in radians.

      Returns:
        A numpy darray of floats of shape (2,) containing the x and y
        coordinates rotated xy.
      """
      rotated_x = np.cos(theta) * xy[0] - np.sin(theta) * xy[1]
      rotated_y = np.sin(theta) * xy[0] + np.cos(theta) * xy[1]
      return np.array([rotated_x, rotated_y])

    # Rotate all intersection biforcation by the orientation of the agent as the
    # intersection label is defined in an agent centered fashion.
    # The extra -pi/4 offset rotates bin boundaries so that each axis-aligned
    # direction lands inside its own quadrant bin below.
    xy_s = [
        rotate(xy - location[0:2], -location[2] - math.pi / 4) for xy in xy_s
    ]
    th_s = [np.arctan2(xy[1], xy[0]) for xy in xy_s]

    out_shape = self._config.output.shape
    if len(out_shape) != 1:
      raise ValueError('Output shape should be of rank 1.')
    num_labels = out_shape[0]
    if num_labels != 16:
      raise ValueError('Currently only 16 labels are supported '
                       '(there are 16 different 4 way intersection types).')
    # Quantize each neighbor direction (in [-pi, pi]) into a quadrant bin.
    # NOTE(review): th == pi maps to bin 4 which would make label 2^4 = 16 and
    # index out of range below -- presumably unreachable; confirm.
    th_s = set([int(math.floor(4 * (th / (2 * np.pi) + 0.5))) for th in th_s])
    one_hot_label = np.zeros((num_labels,), dtype=np.float32)
    # The label is a 4-bit mask: bit i is set iff a bifurcation exists in
    # quadrant i.
    label = 0
    for th in th_s:
      label += pow(2, th)
    one_hot_label[int(label)] = 1.0
    # NOTE(review): dict.values()[0] relies on Python 2 returning a list.
    query = self._env.observation(location).values()[0]
    return [], query, (one_hot_label, None)

  def reward(self, obs, done, info):
    # This task is supervised-only; the RL reward interface must not be used.
    raise ValueError('Do not call.')

  def target_loss(self, truth, predicted, weights=None):
    """One-hot classification loss for the 16-way intersection label."""
    return classification_loss(
        truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class GotoStaticXNoExplorationTask(UnrolledTask):
  """An interface for findX tasks without exploration.

  The agent is initialized a random location in a random world and a random goal
  and the objective is for the agent to move toward the goal. This class
  generates episode for such task. Each generates a sequence of observations x
  and target outputs y. x is the observations and is an OrderedDict with keys
  provided from config.inputs.keys() and the shapes provided in the
  config.inputs. The output is a numpy arrays with the shape specified in the
  config.output. The shape of the array is (sequence_length x action_size) where
  action is the number of actions that can be done in the environment. Note that
  config.output.shape should be set according to the number of actions that can
  be done in the env.
  target outputs y are the groundtruth value of each action that is computed
  from the environment graph. The target output for each action is proportional
  to the progress that each action makes. Target value of 1 means that the
  action takes the agent one step closer, -1 means the action takes the agent
  one step farther. Value of -2 means that action should not take place at all.
  This can be because the action leads to collision or it wants to terminate the
  episode prematurely.
  """

  def __init__(self, env, *args, **kwargs):
    """Constructor.

    Args:
      env: environment object; must expose graph, targets,
        random_step_sequence, pose_to_vertex, vertex_to_pose and action.
      *args: forwarded to UnrolledTask.__init__.
      **kwargs: forwarded to UnrolledTask.__init__ (must include config).

    Raises:
      ValueError: if config.query is set, if config.output is not rank 2
        (sequence_length x number_of_actions), or if any input's sequence
        length differs from the output's.
    """
    super(GotoStaticXNoExplorationTask, self).__init__(*args, **kwargs)
    if self._config.query is not None:
      raise ValueError('query should be None.')
    # Fix: this check used to appear twice; the redundant copy was removed.
    if len(self._config.output.shape) != 2:
      raise ValueError('output should only have two dimensions:'
                       '(sequence_length x number_of_actions)')
    for input_config in self._config.inputs.values():
      if input_config.shape[0] != self._config.output.shape[0]:
        raise ValueError('the first dimension of the input and output should'
                         'be the same.')
    self._env = env

  def _compute_shortest_path_length(self, vertex, target_vertices):
    """Computes length of the shortest path from vertex to any target vertexes.

    Args:
      vertex: integer, index of the vertex in the environment graph.
      target_vertices: list of the target vertexes

    Returns:
      integer, minimum distance from the vertex to any of the target_vertices.

    Raises:
      ValueError: if there is no path between the vertex and at least one of
        the target_vertices.
    """
    try:
      # nx.shortest_path returns the node list, so this counts nodes
      # (edges + 1); only relative differences are used downstream.
      return np.min([
          len(nx.shortest_path(self._env.graph, vertex, t))
          for t in target_vertices
      ])
    except:
      # Fix: logging restored for consistency with the sibling NewTask class;
      # the exception is still re-raised to the caller.
      logging.error('there is no path between vertex %d and at least one of '
                    'the targets %r', vertex, target_vertices)
      raise

  def _compute_gt_value(self, vertex, target_vertices):
    """Computes groundtruth value of all the actions at the vertex.

    The value of each action is the difference each action makes in the length
    of the shortest path to the goal. If an action takes the agent one step
    closer to the goal the value is 1. In case, it takes the agent one step away
    from the goal it would be -1. If it leads to collision or if the agent uses
    action stop before reaching to the goal it is -2. To avoid scale issues the
    gt_values are multipled by 0.5.

    Args:
      vertex: integer, the index of current vertex.
      target_vertices: list of the integer indexes of the target views.

    Returns:
      numpy array with shape (action_size,) and each element is the groundtruth
      value of each action based on the progress each action makes.
    """
    action_size = self._config.output.shape[1]
    # Default of -2 marks invalid actions (collision / premature stop).
    output_value = np.ones((action_size,), dtype=np.float32) * -2
    my_distance = self._compute_shortest_path_length(vertex, target_vertices)
    for adj in self._env.graph[vertex]:
      adj_distance = self._compute_shortest_path_length(adj, target_vertices)
      if adj_distance is None:
        continue
      action_index = self._env.action(
          self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
      assert action_index is not None, ('{} is not adjacent to {}. There might '
                                        'be a problem in environment graph '
                                        'connectivity because there is no '
                                        'direct edge between the given '
                                        'vertices').format(
                                            self._env.vertex_to_pose(vertex),
                                            self._env.vertex_to_pose(adj))
      output_value[action_index] = my_distance - adj_distance
    return output_value * 0.5

  def episode(self):
    """Returns data needed to train and test a single episode.

    Returns:
      (inputs, None, output) where inputs is a dictionary of modality types to
      numpy arrays. The second element is query but we assume that the goal
      is also given as part of observation so it should be None for this task,
      and the outputs is the tuple of ground truth action values with the
      shape of (sequence_length x action_size) that is coming from
      config.output.shape and a numpy array with the shape of
      (sequence_length,) that is 1 if the corresponding element of the
      input and output should be used in the training optimization.

    Raises:
      ValueError: If the output values for env.random_step_sequence is not
        valid.
      ValueError: If the shape of observations coming from the env is not
        consistent with the config.
      ValueError: If there is a modality type specified in the config but the
        environment does not return that.
    """
    # Sequence length is the first dimension of any of the input tensors.
    # NOTE(review): .values()[0] relies on Python 2 dict views being lists.
    sequence_length = self._config.inputs.values()[0].shape[0]
    modality_types = self._config.inputs.keys()

    path, _, _, step_outputs = self._env.random_step_sequence(
        max_len=sequence_length)
    target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
    if len(path) != len(step_outputs):
      raise ValueError('path, and step_outputs should have equal length'
                       ' {}!={}'.format(len(path), len(step_outputs)))

    # Building up observations. observations will be a OrderedDict of
    # modality types. The values are numpy arrays that follow the given shape
    # in the input config for each modality type.
    observations = collections.OrderedDict([k, []] for k in modality_types)
    for step_output in step_outputs:
      obs_dict = step_output[0]
      # Only going over the modality types that are specified in the input
      # config.
      for modality_type in modality_types:
        if modality_type not in obs_dict:
          raise ValueError('modality type is not returned from the environment.'
                           '{} not in {}'.format(modality_type,
                                                 obs_dict.keys()))
        obs = obs_dict[modality_type]
        if np.any(
            obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
          raise ValueError(
              'The observations should have the same size as speicifed in'
              'config for modality type {}. {} != {}'.format(
                  modality_type, obs.shape,
                  self._config.inputs[modality_type].shape[1:]))
        observations[modality_type].append(obs)

    gt_value = [self._compute_gt_value(v, target_vertices) for v in path]

    # Pad (or clip) both values and observations to sequence_length; the mask
    # marks the real (non-padded) steps.
    # pylint: disable=unbalanced-tuple-unpacking
    gt_value, _, value_mask = _pad_or_clip_array(
        np.array(gt_value),
        sequence_length,
        is_front_clip=False,
        output_mask=True,
    )
    for modality_type, obs in observations.iteritems():
      observations[modality_type], _, mask = _pad_or_clip_array(
          np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
      assert np.all(mask == value_mask)

    return observations, None, (gt_value, value_mask)

  def reset(self, observation):
    """Called after the environment is reset."""
    pass

  def target_loss(self, true_targets, targets, weights=None):
    """A loss for training a task model.

    This loss measures the discrepancy between the task outputs, the true and
    predicted ones.

    Args:
      true_targets: tf.Tensor of tf.float32 with the shape of
        (batch_size x sequence_length x action_size).
      targets: tf.Tensor of tf.float32 with the shape of
        (batch_size x sequence_length x action_size).
      weights: tf.Tensor of tf.bool with the shape of
        (batch_size x sequence_length).

    Raises:
      ValueError: if the shapes of the input tensors are not consistent.

    Returns:
      L2 loss between the predicted action values and true action values.
    """
    targets_shape = targets.get_shape().as_list()
    true_targets_shape = true_targets.get_shape().as_list()
    if len(targets_shape) != 3 or len(true_targets_shape) != 3:
      raise ValueError('invalid shape for targets or true_targets_shape')
    if np.any(targets_shape != true_targets_shape):
      raise ValueError('the shape of targets and true_targets are not the same'
                       '{} != {}'.format(targets_shape, true_targets_shape))

    if weights is not None:
      # Filtering targets and true_targets using weights.
      weights_shape = weights.get_shape().as_list()
      if np.any(weights_shape != targets_shape[0:2]):
        raise ValueError('The first two elements of weights shape should match'
                         'target. {} != {}'.format(weights_shape,
                                                   targets_shape))
      true_targets = tf.boolean_mask(true_targets, weights)
      targets = tf.boolean_mask(targets, weights)

    return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
                                        tf.reshape(true_targets, [-1]))

  def reward(self, obs, done, info):
    raise NotImplementedError('reward is not implemented for this task')
################################################################################
class NewTask(UnrolledTask):
  """Task producing supervised action-value targets along a random walk.

  NOTE(review): this class is a near line-for-line duplicate of
  GotoStaticXNoExplorationTask (the visible differences are active error
  logging and no config validation in the constructor) -- consider
  deduplicating.
  """

  def __init__(self, env, *args, **kwargs):
    super(NewTask, self).__init__(*args, **kwargs)
    self._env = env

  def _compute_shortest_path_length(self, vertex, target_vertices):
    """Computes length of the shortest path from vertex to any target vertexes.

    Args:
      vertex: integer, index of the vertex in the environment graph.
      target_vertices: list of the target vertexes

    Returns:
      integer, minimum distance from the vertex to any of the target_vertices.

    Raises:
      ValueError: if there is no path between the vertex and at least one of
        the target_vertices.
    """
    try:
      # nx.shortest_path returns the node list, so this counts nodes
      # (edges + 1); only relative differences are used downstream.
      return np.min([
          len(nx.shortest_path(self._env.graph, vertex, t))
          for t in target_vertices
      ])
    except:
      # Log-and-reraise: the bare except is deliberate so any failure is
      # recorded before propagating.
      logging.error('there is no path between vertex %d and at least one of '
                    'the targets %r', vertex, target_vertices)
      raise

  def _compute_gt_value(self, vertex, target_vertices):
    """Computes groundtruth value of all the actions at the vertex.

    The value of each action is the difference each action makes in the length
    of the shortest path to the goal. If an action takes the agent one step
    closer to the goal the value is 1. In case, it takes the agent one step away
    from the goal it would be -1. If it leads to collision or if the agent uses
    action stop before reaching to the goal it is -2. To avoid scale issues the
    gt_values are multipled by 0.5.

    Args:
      vertex: integer, the index of current vertex.
      target_vertices: list of the integer indexes of the target views.

    Returns:
      numpy array with shape (action_size,) and each element is the groundtruth
      value of each action based on the progress each action makes.
    """
    action_size = self._config.output.shape[1]
    # Default of -2 marks invalid actions (collision / premature stop).
    output_value = np.ones((action_size), dtype=np.float32) * -2
    # own compute _compute_shortest_path_length - returnts float
    my_distance = self._compute_shortest_path_length(vertex, target_vertices)
    for adj in self._env.graph[vertex]:
      adj_distance = self._compute_shortest_path_length(adj, target_vertices)
      if adj_distance is None:
        continue
      action_index = self._env.action(
          self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
      assert action_index is not None, ('{} is not adjacent to {}. There might '
                                        'be a problem in environment graph '
                                        'connectivity because there is no '
                                        'direct edge between the given '
                                        'vertices').format(
                                            self._env.vertex_to_pose(vertex),
                                            self._env.vertex_to_pose(adj))
      output_value[action_index] = my_distance - adj_distance
    return output_value * 0.5

  def episode(self):
    """Returns data needed to train and test a single episode.

    Returns:
      (inputs, None, output) where inputs is a dictionary of modality types to
      numpy arrays. The second element is query but we assume that the goal
      is also given as part of observation so it should be None for this task,
      and the outputs is the tuple of ground truth action values with the
      shape of (sequence_length x action_size) that is coming from
      config.output.shape and a numpy array with the shape of
      (sequence_length,) that is 1 if the corresponding element of the
      input and output should be used in the training optimization.

    Raises:
      ValueError: If the output values for env.random_step_sequence is not
        valid.
      ValueError: If the shape of observations coming from the env is not
        consistent with the config.
      ValueError: If there is a modality type specified in the config but the
        environment does not return that.
    """
    # Sequence length is the first dimension of any of the input tensors.
    # NOTE(review): .values()[0] relies on Python 2 dict views being lists.
    sequence_length = self._config.inputs.values()[0].shape[0]
    modality_types = self._config.inputs.keys()

    path, _, _, step_outputs = self._env.random_step_sequence(
        max_len=sequence_length)
    target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
    if len(path) != len(step_outputs):
      raise ValueError('path, and step_outputs should have equal length'
                       ' {}!={}'.format(len(path), len(step_outputs)))

    # Building up observations. observations will be a OrderedDict of
    # modality types. The values are numpy arrays that follow the given shape
    # in the input config for each modality type.
    observations = collections.OrderedDict([k, []] for k in modality_types)
    for step_output in step_outputs:
      obs_dict = step_output[0]
      # Only going over the modality types that are specified in the input
      # config.
      for modality_type in modality_types:
        if modality_type not in obs_dict:
          raise ValueError('modality type is not returned from the environment.'
                           '{} not in {}'.format(modality_type,
                                                 obs_dict.keys()))
        obs = obs_dict[modality_type]
        if np.any(
            obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
          raise ValueError(
              'The observations should have the same size as speicifed in'
              'config for modality type {}. {} != {}'.format(
                  modality_type, obs.shape,
                  self._config.inputs[modality_type].shape[1:]))
        observations[modality_type].append(obs)

    gt_value = [self._compute_gt_value(v, target_vertices) for v in path]

    # Pad (or clip) values and observations to sequence_length; the mask
    # marks the real (non-padded) steps.
    # pylint: disable=unbalanced-tuple-unpacking
    gt_value, _, value_mask = _pad_or_clip_array(
        np.array(gt_value),
        sequence_length,
        is_front_clip=False,
        output_mask=True,
    )
    for modality_type, obs in observations.iteritems():
      observations[modality_type], _, mask = _pad_or_clip_array(
          np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
      assert np.all(mask == value_mask)

    return observations, None, (gt_value, value_mask)

  def reset(self, observation):
    """Called after the environment is reset."""
    pass

  def target_loss(self, true_targets, targets, weights=None):
    """A loss for training a task model.

    This loss measures the discrepancy between the task outputs, the true and
    predicted ones.

    Args:
      true_targets: tf.Tensor of tf.float32 with the shape of
        (batch_size x sequence_length x action_size).
      targets: tf.Tensor of tf.float32 with the shape of
        (batch_size x sequence_length x action_size).
      weights: tf.Tensor of tf.bool with the shape of
        (batch_size x sequence_length).

    Raises:
      ValueError: if the shapes of the input tensors are not consistent.

    Returns:
      L2 loss between the predicted action values and true action values.
    """
    targets_shape = targets.get_shape().as_list()
    true_targets_shape = true_targets.get_shape().as_list()
    if len(targets_shape) != 3 or len(true_targets_shape) != 3:
      raise ValueError('invalid shape for targets or true_targets_shape')
    if np.any(targets_shape != true_targets_shape):
      raise ValueError('the shape of targets and true_targets are not the same'
                       '{} != {}'.format(targets_shape, true_targets_shape))

    if weights is not None:
      # Filtering targets and true_targets using weights.
      weights_shape = weights.get_shape().as_list()
      if np.any(weights_shape != targets_shape[0:2]):
        raise ValueError('The first two elements of weights shape should match'
                         'target. {} != {}'.format(weights_shape,
                                                   targets_shape))
      true_targets = tf.boolean_mask(true_targets, weights)
      targets = tf.boolean_mask(targets, weights)

    return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
                                        tf.reshape(true_targets, [-1]))

  def reward(self, obs, done, info):
    raise NotImplementedError('reward is not implemented for this task')
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=line-too-long
# pyformat: disable
"""Train and eval for supervised navigation training.
For training:
python train_supervised_active_vision.py \
--mode='train' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
For testing:
python train_supervised_active_vision.py \
--mode='eval' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
"""
import collections
import os
import time
from absl import app
from absl import flags
from absl import logging
import networkx as nx
import numpy as np
import tensorflow as tf
import gin
import embedders
import policies
import tasks
from envs import active_vision_dataset_env
from envs import task_env
slim = tf.contrib.slim

flags.DEFINE_string('logdir', '',
                    'Path to a directory to write summaries and checkpoints')
# Parameters controlling the training setup. In general one would not need to
# modify them.
flags.DEFINE_string('master', 'local',
                    'BNS name of the TensorFlow master, or local.')
flags.DEFINE_integer('task_id', 0,
                     'Task id of the replica running the training.')
flags.DEFINE_integer('ps_tasks', 0,
                     'Number of tasks in the ps job. If 0 no ps job is used.')

# Optimization schedule and logging cadence.
flags.DEFINE_integer('decay_steps', 1000,
                     'Number of steps for exponential decay.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_integer('batch_size', 8, 'Batch size.')
flags.DEFINE_integer('sequence_length', 20, 'sequence length')
flags.DEFINE_integer('train_iters', 200000, 'number of training iterations.')
flags.DEFINE_integer('save_summaries_secs', 300,
                     'number of seconds between saving summaries')
flags.DEFINE_integer('save_interval_secs', 300,
                     'numer of seconds between saving variables')
flags.DEFINE_integer('log_every_n_steps', 20, 'number of steps between logging')

# Model architecture: input modalities, embedder conv stack, LSTM and policy
# sizes.
flags.DEFINE_string('modality_types', '',
                    'modality names in _ separated format')
flags.DEFINE_string('conv_window_sizes', '8_4_3',
                    'conv window size in separated by _')
flags.DEFINE_string('conv_strides', '4_2_1', '')
flags.DEFINE_string('conv_channels', '8_16_16', '')
flags.DEFINE_integer('embedding_fc_size', 128,
                     'size of embedding for each modality')
flags.DEFINE_integer('obs_resolution', 64,
                     'resolution of the input observations')
flags.DEFINE_integer('lstm_cell_size', 2048, 'size of lstm cell size')
flags.DEFINE_integer('policy_fc_size', 2048,
                     'size of fully connected layers for policy part')
flags.DEFINE_float('weight_decay', 0.0002, 'weight decay')
flags.DEFINE_integer('goal_category_count', 5, 'number of goal categories')
flags.DEFINE_integer('action_size', 7, 'number of possible actions')
flags.DEFINE_integer('max_eval_episode_length', 100,
                     'maximum sequence length for evaluation.')

# Mode selection, evaluation and checkpoint paths.
flags.DEFINE_enum('mode', 'train', ['train', 'eval'],
                  'indicates whether it is in training or evaluation')
flags.DEFINE_integer('test_iters', 194,
                     'number of iterations that the eval needs to be run')
flags.DEFINE_multi_string('gin_config', [],
                          'List of paths to a gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
                          'Newline separated list of Gin parameter bindings.')
flags.DEFINE_string(
    'resnet50_path', './resnet_v2_50_checkpoint/resnet_v2_50.ckpt', 'path to resnet50'
    'checkpoint')
flags.DEFINE_bool('freeze_resnet_weights', True, '')
flags.DEFINE_string(
    'eval_init_points_file_name', '',
    'Name of the file that containts the initial locations and'
    'worlds for each evalution point')

FLAGS = flags.FLAGS

# Active Vision Dataset world names used for training and held-out testing.
TRAIN_WORLDS = [
    'Home_001_1', 'Home_001_2', 'Home_002_1', 'Home_003_1', 'Home_003_2',
    'Home_004_1', 'Home_004_2', 'Home_005_1', 'Home_005_2', 'Home_006_1',
    'Home_010_1'
]

TEST_WORLDS = ['Home_011_1', 'Home_013_1', 'Home_016_1']
def create_modality_types():
  """Parses FLAGS.modality_types into a list of task_env.ModalityTypes.

  The flag is a '_'-separated list of names from
  {'image', 'sseg', 'det', 'depth'}.

  Returns:
    A list of task_env.ModalityTypes values, one per name, in flag order.

  Raises:
    ValueError: if the flag is empty or contains an unknown name.
  """
  if not FLAGS.modality_types:
    raise ValueError('there needs to be at least one modality type')
  name_to_type = {
      'image': task_env.ModalityTypes.IMAGE,
      'sseg': task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
      'depth': task_env.ModalityTypes.DEPTH,
      'det': task_env.ModalityTypes.OBJECT_DETECTION,
  }
  names = FLAGS.modality_types.split('_')
  for name in names:
    if name not in name_to_type:
      raise ValueError('invalid modality type: {}'.format(name))
  return [name_to_type[name] for name in names]
def create_task_io_config(
    modality_types,
    goal_category_count,
    action_size,
    sequence_length,
):
  """Generates task io config.

  Args:
    modality_types: list of task_env.ModalityTypes used as model inputs.
    goal_category_count: number of goal categories (goal one-hot length).
    action_size: number of actions the policy predicts.
    sequence_length: unroll length; first dimension of every input/output.

  Returns:
    A tasks.UnrolledTaskIOConfig with one input per requested modality plus
    GOAL and PREV_ACTION, a (sequence_length x action_size) float32 output,
    and no query.
  """
  shape_prefix = [sequence_length, FLAGS.obs_resolution, FLAGS.obs_resolution]
  shapes = {
      task_env.ModalityTypes.IMAGE: [sequence_length, 224, 224, 3],
      task_env.ModalityTypes.DEPTH: shape_prefix + [
          2,
      ],
      task_env.ModalityTypes.SEMANTIC_SEGMENTATION: shape_prefix + [
          1,
      ],
      task_env.ModalityTypes.OBJECT_DETECTION: shape_prefix + [
          90,
      ]
  }
  types = {k: tf.float32 for k in shapes}
  # Raw images are kept as uint8; everything else is float32.
  types[task_env.ModalityTypes.IMAGE] = tf.uint8
  inputs = collections.OrderedDict(
      [[mtype, (types[mtype], shapes[mtype])] for mtype in modality_types])
  inputs[task_env.ModalityTypes.GOAL] = (tf.float32,
                                         [sequence_length, goal_category_count])
  # Previous action is a one-hot over actions plus one trailing success bit.
  inputs[task_env.ModalityTypes.PREV_ACTION] = (tf.float32, [
      sequence_length, action_size + 1
  ])
  # Fix: removed stray Python 2 debug statement `print inputs`.
  return tasks.UnrolledTaskIOConfig(
      inputs=inputs,
      output=(tf.float32, [sequence_length, action_size]),
      query=None)
def map_to_embedder(modality_type):
  """Maps modality_type to its corresponding embedder.

  PREV_ACTION gets no embedder (None), GOAL passes through unchanged, IMAGE
  uses a ResNet50, and every other modality is handled by a small conv
  network configured from the conv_* flags.
  """
  if modality_type == task_env.ModalityTypes.PREV_ACTION:
    return None
  if modality_type == task_env.ModalityTypes.GOAL:
    return embedders.IdentityEmbedder()
  if modality_type == task_env.ModalityTypes.IMAGE:
    return embedders.ResNet50Embedder()

  def parse_int_list(flag_value):
    # '8_4_3' -> [8, 4, 3]
    return [int(token) for token in flag_value.split('_')]

  small_net_params = tf.contrib.training.HParams(
      to_one_hot=modality_type == task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
      one_hot_length=10,
      conv_sizes=parse_int_list(FLAGS.conv_window_sizes),
      conv_strides=parse_int_list(FLAGS.conv_strides),
      conv_channels=parse_int_list(FLAGS.conv_channels),
      embedding_size=FLAGS.embedding_fc_size,
      weight_decay_rate=FLAGS.weight_decay,
  )
  return embedders.SmallNetworkEmbedder(small_net_params)
def create_train_and_init_ops(policy, task):
  """Creates training ops given the arguments.

  Args:
    policy: the policy for the task.
    task: the task instance.

  Returns:
    train_op: the op that needs to be runned at each step.
    summaries_op: the summary op that is executed.
    init_fn: the op that initializes the variables if there is no previous
      checkpoint. If Resnet50 is not used in the model it is None, otherwise
      it reads the weights from FLAGS.resnet50_path and sets the init_fn
      to the op that initializes the ResNet50 with the pre-trained weights.

  NOTE(review): the docstring mentions summaries_op, but only
  (train_op, init_fn) are actually returned; summaries are registered via
  tf.summary.scalar and presumably collected by the caller -- confirm.
  """
  assert isinstance(task, tasks.GotoStaticXNoExplorationTask)
  assert isinstance(policy, policies.Policy)

  inputs, _, gt_outputs, masks = task.tf_episode_batch(FLAGS.batch_size)
  outputs, _ = policy.build(inputs, None)
  loss = task.target_loss(gt_outputs, outputs, masks)

  init_fn = None

  # If resnet is added to the graph, init_fn should initialize resnet weights
  # if there is no previous checkpoint.
  variables_assign_dict = {}
  vars_list = []
  for v in slim.get_model_variables():
    if v.name.find('resnet') >= 0:
      # Resnet variables are excluded from training when frozen, but are
      # always mapped for checkpoint restoration. The slice drops the scope
      # prefix before 'resnet' and the trailing ':0'.
      if not FLAGS.freeze_resnet_weights:
        vars_list.append(v)
      variables_assign_dict[v.name[v.name.find('resnet'):-2]] = v
    else:
      vars_list.append(v)

  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.exponential_decay(
      FLAGS.learning_rate,
      global_step,
      decay_steps=FLAGS.decay_steps,
      decay_rate=0.98,
      staircase=True)
  optimizer = tf.train.AdamOptimizer(learning_rate)
  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      global_step=global_step,
      variables_to_train=vars_list,
  )

  if variables_assign_dict:
    init_fn = slim.assign_from_checkpoint_fn(
        FLAGS.resnet50_path,
        variables_assign_dict,
        ignore_missing_vars=False)
  scalar_summaries = {}
  scalar_summaries['LR'] = learning_rate
  scalar_summaries['loss'] = loss

  # NOTE(review): iteritems is Python 2 only.
  for name, summary in scalar_summaries.iteritems():
    tf.summary.scalar(name, summary)

  return train_op, init_fn
def create_eval_ops(policy, config, possible_targets):
  """Creates the necessary ops for evaluation.

  Args:
    policy: the policy whose build() produces the action outputs.
    config: task io config providing input types/shapes.
    possible_targets: list of target category names; one eval summary is
      created per target plus an aggregate 'mean'.

  Returns:
    (inputs_feed, prev_state_feed, policy_outputs, (summary_op, summary_feed))
    where inputs_feed maps each modality to its placeholder (batch of 1),
    prev_state_feed is the pair of LSTM state placeholders, policy_outputs is
    the result of policy.build, and summary_feed maps each target name to the
    scalar placeholder behind its eval summary.
  """
  inputs_feed = collections.OrderedDict([[
      mtype,
      tf.placeholder(config.inputs[mtype].type,
                     [1] + config.inputs[mtype].shape)
  ] for mtype in config.inputs])
  # Previous action: one-hot over actions plus one extra (success) slot.
  inputs_feed[task_env.ModalityTypes.PREV_ACTION] = tf.placeholder(
      tf.float32, [1, 1] + [
          config.output.shape[-1] + 1,
      ])
  # Two placeholders: LSTM cell state and hidden state.
  prev_state_feed = [
      tf.placeholder(
          tf.float32, [1, FLAGS.lstm_cell_size], name='prev_state_{}'.format(i))
      for i in range(2)
  ]
  policy_outputs = policy.build(inputs_feed, prev_state_feed)
  summary_feed = {}
  for c in possible_targets + ['mean']:
    summary_feed[c] = tf.placeholder(
        tf.float32, [], name='eval_in_range_{}_input'.format(c))
    tf.summary.scalar('eval_in_range_{}'.format(c), summary_feed[c])
  return inputs_feed, prev_state_feed, policy_outputs, (tf.summary.merge_all(),
                                                        summary_feed)
def unroll_policy_for_eval(
    sess,
    env,
    inputs_feed,
    prev_state_feed,
    policy_outputs,
    number_of_steps,
    output_folder,
):
  """unrolls the policy for testing.

  Args:
    sess: tf.Session
    env: The environment.
    inputs_feed: dictionary of placeholder for the input modalities.
    prev_state_feed: placeholder for the input to the prev_state of the model.
    policy_outputs: tensor that contains outputs of the policy.
    number_of_steps: maximum number of unrolling steps.
    output_folder: output_folder where the function writes a dictionary of
      detailed information about the path. The dictionary keys are 'states' and
      'distance'. The value for 'states' is the list of states that the agent
      goes along the path. The value for 'distance' contains the length of
      shortest path to the goal at each step.

  Returns:
    states: list of states along the path.
    distance: list of distances along the path.
  """
  # LSTM state starts at zeros and is threaded through each step below.
  prev_state = [
      np.zeros((1, FLAGS.lstm_cell_size), dtype=np.float32) for _ in range(2)
  ]
  # NOTE(review): prev_action is updated after each step but never placed in
  # feed_dict (and never cleared between steps), so it appears to be dead
  # state -- confirm whether it was meant to be fed to the PREV_ACTION input.
  prev_action = np.zeros((1, 1, FLAGS.action_size + 1), dtype=np.float32)
  obs = env.reset()
  distances_to_goal = []
  states = []
  unique_id = '{}_{}'.format(env.cur_image_id(), env.goal_string)
  for _ in range(number_of_steps):
    # Record the current shortest-path distance to the nearest target.
    distances_to_goal.append(
        np.min([
            len(
                nx.shortest_path(env.graph, env.pose_to_vertex(env.state()),
                                 env.pose_to_vertex(target_view)))
            for target_view in env.targets()
        ]))
    states.append(env.state())
    # Each observation is wrapped to (batch=1, time=1, ...).
    feed_dict = {inputs_feed[mtype]: [[obs[mtype]]] for mtype in inputs_feed}
    feed_dict[prev_state_feed[0]] = prev_state[0]
    feed_dict[prev_state_feed[1]] = prev_state[1]
    action_values, prev_state = sess.run(policy_outputs, feed_dict=feed_dict)
    # Greedy action selection.
    chosen_action = np.argmax(action_values[0])
    obs, _, done, info = env.step(np.int32(chosen_action))
    prev_action[0][0][chosen_action] = 1.
    prev_action[0][0][-1] = float(info['success'])
    # If the agent chooses action stop or the number of steps exceeeded
    # env._episode_length.
    if done:
      break

  # logging.info('distance = %d, id = %s, #steps = %d', distances_to_goal[-1],
  output_path = os.path.join(output_folder, unique_id + '.npy')
  # NOTE(review): np.save writes binary; mode 'w' works on Python 2 but would
  # need 'wb' on Python 3.
  with tf.gfile.Open(output_path, 'w') as f:
    print 'saving path information to {}'.format(output_path)
    np.save(f, {'states': states, 'distance': distances_to_goal})
  return states, distances_to_goal
def init(sequence_length, eval_init_points_file_name, worlds):
  """Initializes the common operations between train and test.

  Args:
    sequence_length: unroll length used for the task io config and policy.
    eval_init_points_file_name: file with fixed starting points for eval, or
      None for training.
    worlds: list of world names the environment should load.

  Returns:
    (env, config, task, policy) fully constructed and wired together.
  """
  modality_types = create_modality_types()
  logging.info('modality types: %r', modality_types)
  # negative reward_goal_range prevents the env from terminating early when the
  # agent is close to the goal. The policy should keep the agent until the end
  # of the 100 steps either through chosing stop action or oscilating around
  # the target.
  env = active_vision_dataset_env.ActiveVisionDatasetEnv(
      modality_types=modality_types +
      [task_env.ModalityTypes.GOAL, task_env.ModalityTypes.PREV_ACTION],
      reward_goal_range=-1,
      eval_init_points_file_name=eval_init_points_file_name,
      worlds=worlds,
      output_size=FLAGS.obs_resolution,
  )

  config = create_task_io_config(
      modality_types=modality_types,
      goal_category_count=FLAGS.goal_category_count,
      action_size=FLAGS.action_size,
      sequence_length=sequence_length,
  )
  task = tasks.GotoStaticXNoExplorationTask(env=env, config=config)
  # One embedder per configured input modality (None for PREV_ACTION).
  embedders_dict = {mtype: map_to_embedder(mtype) for mtype in config.inputs}
  policy_params = tf.contrib.training.HParams(
      lstm_state_size=FLAGS.lstm_cell_size,
      fc_channels=FLAGS.policy_fc_size,
      weight_decay=FLAGS.weight_decay,
      target_embedding_size=FLAGS.embedding_fc_size,
  )
  policy = policies.LSTMPolicy(
      modality_names=config.inputs.keys(),
      embedders_dict=embedders_dict,
      action_size=FLAGS.action_size,
      params=policy_params,
      max_episode_length=sequence_length)
  return env, config, task, policy
def test():
  """Contains all the operations for testing policies.

  Polls FLAGS.logdir for new checkpoints; each time a new checkpoint appears
  it is restored, the policy is unrolled for FLAGS.test_iters episodes, and
  per-goal success rates (final distance <= 7 steps) are written as summaries
  under <logdir>/evals/<step>.
  """
  env, config, _, policy = init(1, 'all_init_configs', TEST_WORLDS)
  inputs_feed, prev_state_feed, policy_outputs, summary_op = create_eval_ops(
      policy, config, env.possible_targets)
  sv = tf.train.Supervisor(logdir=FLAGS.logdir)
  prev_checkpoint = None
  with sv.managed_session(
      start_standard_services=False,
      config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    while not sv.should_stop():
      # Busy-wait (1s sleeps) until a checkpoint different from the last
      # evaluated one shows up in the log directory.
      while True:
        new_checkpoint = tf.train.latest_checkpoint(FLAGS.logdir)
        print 'new_checkpoint ', new_checkpoint
        if not new_checkpoint:
          time.sleep(1)
          continue
        if prev_checkpoint is None:
          prev_checkpoint = new_checkpoint
          break
        if prev_checkpoint != new_checkpoint:
          prev_checkpoint = new_checkpoint
          break
        else:  # if prev_checkpoint == new_checkpoint, we have to wait more.
          time.sleep(1)
      # Checkpoint paths end in '-<global_step>'.
      checkpoint_step = int(new_checkpoint[new_checkpoint.rfind('-') + 1:])
      sv.saver.restore(sess, new_checkpoint)
      print '--------------------'
      print 'evaluating checkpoint {}'.format(new_checkpoint)
      folder_path = os.path.join(FLAGS.logdir, 'evals', str(checkpoint_step))
      if not tf.gfile.Exists(folder_path):
        tf.gfile.MakeDirs(folder_path)
      # One success-list per possible goal category.
      eval_stats = {c: [] for c in env.possible_targets}
      for test_iter in range(FLAGS.test_iters):
        print 'evaluating {} of {}'.format(test_iter, FLAGS.test_iters)
        _, distance_to_goal = unroll_policy_for_eval(
            sess,
            env,
            inputs_feed,
            prev_state_feed,
            policy_outputs,
            FLAGS.max_eval_episode_length,
            folder_path,
        )
        print 'goal = {}'.format(env.goal_string)
        # Episode counts as success when the final distance is <= 7 steps.
        eval_stats[env.goal_string].append(float(distance_to_goal[-1] <= 7))
      # Python 2: iteritems()/values() return concrete lists here.
      eval_stats = {k: np.mean(v) for k, v in eval_stats.iteritems()}
      eval_stats['mean'] = np.mean(eval_stats.values())
      print eval_stats
      feed_dict = {summary_op[1][c]: eval_stats[c] for c in eval_stats}
      summary_str = sess.run(summary_op[0], feed_dict=feed_dict)
      writer = sv.summary_writer
      writer.add_summary(summary_str, checkpoint_step)
      writer.flush()
def train():
  """Runs supervised training of the policy with slim.learning.train."""
  _, _, task, policy = init(FLAGS.sequence_length, None, TRAIN_WORLDS)
  print(FLAGS.save_summaries_secs)
  print(FLAGS.save_interval_secs)
  print(FLAGS.logdir)
  # Place variables on parameter servers when running distributed.
  device_setter = tf.train.replica_device_setter(
      ps_tasks=FLAGS.ps_tasks, merge_devices=True)
  with tf.device(device_setter):
    train_op, init_fn = create_train_and_init_ops(policy=policy, task=task)
    print(FLAGS.logdir)
    session_config = tf.ConfigProto(allow_soft_placement=True)
    slim.learning.train(
        train_op=train_op,
        init_fn=init_fn,
        logdir=FLAGS.logdir,
        is_chief=FLAGS.task_id == 0,
        number_of_steps=FLAGS.train_iters,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        session_config=session_config,
    )
def main(_):
  """Parses gin configuration and dispatches to train() or test()."""
  gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
  if FLAGS.mode != 'train':
    test()
  else:
    train()


if __name__ == '__main__':
  app.run(main)
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# blaze build -c opt train_supervised_active_vision
# bazel build -c opt --config=cuda --copt=-mavx train_supervised_active_vision && \
# Launches supervised training of the active-vision navigation policy.
# Only the object-detection ('det') modality is fed to the policy; the
# dataset location is injected through the gin binding below.
# NOTE(review): logdir and dataset_root are user-specific absolute paths —
# adjust before running outside the original author's environment.
bazel-bin/research/cognitive_planning/train_supervised_active_vision \
  --mode='train' \
  --logdir=/usr/local/google/home/kosecka/local_avd_train/ \
  --modality_types='det' \
  --batch_size=8 \
  --train_iters=200000 \
  --lstm_cell_size=2048 \
  --policy_fc_size=2048 \
  --sequence_length=20 \
  --max_eval_episode_length=100 \
  --test_iters=194 \
  --gin_config=envs/configs/active_vision_config.gin \
  --gin_params='ActiveVisionDatasetEnv.dataset_root="/cns/jn-d/home/kosecka/AVD_Minimal/"' \
  --logtostderr
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A set of functions that are used for visualization.
These functions often receive an image, perform some visualization on the image.
The functions do not return a value, instead they modify the image itself.
"""
import collections
import functools
# Set headless-friendly backend.
import matplotlib; matplotlib.use('Agg') # pylint: disable=multiple-statements
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
import six
import tensorflow as tf
import standard_fields as fields
# Pixel margins reserved when a title is drawn on an image.
_TITLE_LEFT_MARGIN = 10
_TITLE_TOP_MARGIN = 10
# Palette of PIL-recognized color names used to color detection boxes.
# A class id is mapped to a color via `class_id % len(STANDARD_COLORS)`.
STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]
def save_image_array_as_png(image, output_path):
  """Saves an image (represented as a numpy array) to PNG.

  Args:
    image: a numpy array with shape [height, width, 3].
    output_path: path to which image should be written.
  """
  pil_img = Image.fromarray(np.uint8(image)).convert('RGB')
  # tf.gfile handles both local and remote filesystems.
  with tf.gfile.Open(output_path, 'w') as fid:
    pil_img.save(fid, 'PNG')
def encode_image_array_as_png_str(image):
  """Encodes a numpy array into a PNG string.

  Args:
    image: a numpy array with shape [height, width, 3].

  Returns:
    PNG encoded image string.
  """
  buf = six.BytesIO()
  Image.fromarray(np.uint8(image)).save(buf, format='PNG')
  encoded = buf.getvalue()
  buf.close()
  return encoded
def draw_bounding_box_on_image_array(image,
                                     ymin,
                                     xmin,
                                     ymax,
                                     xmax,
                                     color='red',
                                     thickness=4,
                                     display_str_list=(),
                                     use_normalized_coordinates=True):
  """Adds a bounding box to an image (numpy array).

  Converts the array to PIL, delegates the drawing to
  draw_bounding_box_on_image, and copies the result back in place.

  Args:
    image: a numpy array with shape [height, width, 3].
    ymin: ymin of bounding box.
    xmin: xmin of bounding box.
    ymax: ymax of bounding box.
    xmax: xmax of bounding box.
    color: color to draw bounding box. Default is red.
    thickness: line thickness. Default value is 4.
    display_str_list: list of strings to display in box
      (each to be shown on its own line).
    use_normalized_coordinates: If True (default), treat coordinates
      ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
      coordinates as absolute.
  """
  rgb_pil = Image.fromarray(np.uint8(image)).convert('RGB')
  draw_bounding_box_on_image(rgb_pil, ymin, xmin, ymax, xmax, color,
                             thickness, display_str_list,
                             use_normalized_coordinates)
  np.copyto(image, np.array(rgb_pil))
def draw_bounding_box_on_image(image,
                               ymin,
                               xmin,
                               ymax,
                               xmax,
                               color='red',
                               thickness=4,
                               display_str_list=(),
                               use_normalized_coordinates=True):
  """Adds a bounding box to an image.

  Bounding box coordinates can be specified in either absolute (pixel) or
  normalized coordinates by setting the use_normalized_coordinates argument.

  Each string in display_str_list is displayed on a separate line above the
  bounding box in black text on a rectangle filled with the input 'color'.
  If the top of the bounding box extends to the edge of the image, the strings
  are displayed below the bounding box.

  Args:
    image: a PIL.Image object.
    ymin: ymin of bounding box.
    xmin: xmin of bounding box.
    ymax: ymax of bounding box.
    xmax: xmax of bounding box.
    color: color to draw bounding box. Default is red.
    thickness: line thickness. Default value is 4.
    display_str_list: list of strings to display in box
      (each to be shown on its own line).
    use_normalized_coordinates: If True (default), treat coordinates
      ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
      coordinates as absolute.
  """
  draw = ImageDraw.Draw(image)
  im_width, im_height = image.size
  if use_normalized_coordinates:
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)
  else:
    (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
  # Closed polyline tracing the four box edges.
  draw.line([(left, top), (left, bottom), (right, bottom),
             (right, top), (left, top)], width=thickness, fill=color)
  try:
    font = ImageFont.truetype('arial.ttf', 24)
  except IOError:
    # Fall back to PIL's built-in bitmap font if arial is not installed.
    font = ImageFont.load_default()

  # If the total height of the display strings added to the top of the bounding
  # box exceeds the top of the image, stack the strings below the bounding box
  # instead of above.
  display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
  # Each display_str has a top and bottom margin of 0.05x.
  total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

  if top > total_display_str_height:
    text_bottom = top
  else:
    text_bottom = bottom + total_display_str_height
  # Reverse list and print from bottom to top.
  for display_str in display_str_list[::-1]:
    text_width, text_height = font.getsize(display_str)
    margin = np.ceil(0.05 * text_height)
    # Filled background rectangle behind the label text.
    draw.rectangle(
        [(left, text_bottom - text_height - 2 * margin), (left + text_width,
                                                          text_bottom)],
        fill=color)
    draw.text(
        (left + margin, text_bottom - text_height - margin),
        display_str,
        fill='black',
        font=font)
    # NOTE(review): moves up by (text_height - 2 * margin) per line; a '+'
    # might have been intended to include the margins, but this matches the
    # upstream visualization code — confirm before changing.
    text_bottom -= text_height - 2 * margin
def draw_bounding_boxes_on_image_array(image,
                                       boxes,
                                       color='red',
                                       thickness=4,
                                       display_str_list_list=()):
  """Draws bounding boxes on image (numpy array).

  Args:
    image: a numpy array object.
    boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
      The coordinates are in normalized format between [0, 1].
    color: color to draw bounding box. Default is red.
    thickness: line thickness. Default value is 4.
    display_str_list_list: list of list of strings; one list of strings per
      bounding box (a box may carry multiple labels).

  Raises:
    ValueError: if boxes is not a [N, 4] array
  """
  as_pil = Image.fromarray(image)
  draw_bounding_boxes_on_image(as_pil, boxes, color, thickness,
                               display_str_list_list)
  # Write the drawn result back into the caller's array.
  np.copyto(image, np.array(as_pil))
def draw_bounding_boxes_on_image(image,
                                 boxes,
                                 color='red',
                                 thickness=4,
                                 display_str_list_list=()):
  """Draws bounding boxes on image.

  Args:
    image: a PIL.Image object.
    boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
      The coordinates are in normalized format between [0, 1].
    color: color to draw bounding box. Default is red.
    thickness: line thickness. Default value is 4.
    display_str_list_list: list of list of strings; one list of strings per
      bounding box (a box may carry multiple labels).

  Raises:
    ValueError: if boxes is not a [N, 4] array
  """
  shape = boxes.shape
  # A scalar array has an empty shape tuple; nothing to draw.
  if not shape:
    return
  if len(shape) != 2 or shape[1] != 4:
    raise ValueError('Input must be of size [N, 4]')
  for idx in range(shape[0]):
    strs = display_str_list_list[idx] if display_str_list_list else ()
    draw_bounding_box_on_image(image, boxes[idx, 0], boxes[idx, 1],
                               boxes[idx, 2], boxes[idx, 3], color, thickness,
                               strs)
def _visualize_boxes(image, boxes, classes, scores, category_index, **kwargs):
  """Thin py_func-compatible wrapper: boxes only, no masks or keypoints."""
  kwargs['category_index'] = category_index
  return visualize_boxes_and_labels_on_image_array(
      image, boxes, classes, scores, **kwargs)
def _visualize_boxes_and_masks(image, boxes, classes, scores, masks,
                               category_index, **kwargs):
  """Thin py_func-compatible wrapper: boxes plus instance masks."""
  kwargs['category_index'] = category_index
  kwargs['instance_masks'] = masks
  return visualize_boxes_and_labels_on_image_array(
      image, boxes, classes, scores, **kwargs)
def _visualize_boxes_and_keypoints(image, boxes, classes, scores, keypoints,
                                   category_index, **kwargs):
  """Thin py_func-compatible wrapper: boxes plus keypoints."""
  kwargs['category_index'] = category_index
  kwargs['keypoints'] = keypoints
  return visualize_boxes_and_labels_on_image_array(
      image, boxes, classes, scores, **kwargs)
def _visualize_boxes_and_masks_and_keypoints(
    image, boxes, classes, scores, masks, keypoints, category_index, **kwargs):
  """Thin py_func-compatible wrapper: boxes, instance masks and keypoints."""
  kwargs['category_index'] = category_index
  kwargs['instance_masks'] = masks
  kwargs['keypoints'] = keypoints
  return visualize_boxes_and_labels_on_image_array(
      image, boxes, classes, scores, **kwargs)
def draw_bounding_boxes_on_image_tensors(images,
                                         boxes,
                                         classes,
                                         scores,
                                         category_index,
                                         instance_masks=None,
                                         keypoints=None,
                                         max_boxes_to_draw=20,
                                         min_score_thresh=0.2,
                                         use_normalized_coordinates=True):
  """Draws bounding boxes, masks, and keypoints on batch of image tensors.

  Args:
    images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
      channels will be ignored.
    boxes: [N, max_detections, 4] float32 tensor of detection boxes.
    classes: [N, max_detections] int tensor of detection classes. Note that
      classes are 1-indexed.
    scores: [N, max_detections] float32 tensor of detection scores.
    category_index: a dict that maps integer ids to category dicts. e.g.
      {1: {1: 'dog'}, 2: {2: 'cat'}, ...}
    instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with
      instance masks.
    keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2]
      with keypoints.
    max_boxes_to_draw: Maximum number of boxes to draw on an image. Default 20.
    min_score_thresh: Minimum score threshold for visualization. Default 0.2.
    use_normalized_coordinates: Whether to assume boxes and keypoints are in
      normalized coordinates (as opposed to absolute coordinates).
      Default is True.

  Returns:
    4D image tensor of type uint8, with boxes drawn on top.
  """
  # Additional channels are being ignored.
  images = images[:, :, :, 0:3]
  visualization_keyword_args = {
      'use_normalized_coordinates': use_normalized_coordinates,
      'max_boxes_to_draw': max_boxes_to_draw,
      'min_score_thresh': min_score_thresh,
      'agnostic_mode': False,
      'line_thickness': 4
  }
  # Select the per-image visualization function whose positional signature
  # matches the optional tensors provided; `elems` must line up positionally
  # with that function's arguments, since tf.map_fn unpacks them in order.
  if instance_masks is not None and keypoints is None:
    visualize_boxes_fn = functools.partial(
        _visualize_boxes_and_masks,
        category_index=category_index,
        **visualization_keyword_args)
    elems = [images, boxes, classes, scores, instance_masks]
  elif instance_masks is None and keypoints is not None:
    visualize_boxes_fn = functools.partial(
        _visualize_boxes_and_keypoints,
        category_index=category_index,
        **visualization_keyword_args)
    elems = [images, boxes, classes, scores, keypoints]
  elif instance_masks is not None and keypoints is not None:
    visualize_boxes_fn = functools.partial(
        _visualize_boxes_and_masks_and_keypoints,
        category_index=category_index,
        **visualization_keyword_args)
    elems = [images, boxes, classes, scores, instance_masks, keypoints]
  else:
    visualize_boxes_fn = functools.partial(
        _visualize_boxes,
        category_index=category_index,
        **visualization_keyword_args)
    elems = [images, boxes, classes, scores]

  def draw_boxes(image_and_detections):
    """Draws boxes on image."""
    # py_func escapes the graph so the PIL-based drawing code can run.
    image_with_boxes = tf.py_func(visualize_boxes_fn, image_and_detections,
                                  tf.uint8)
    return image_with_boxes

  images = tf.map_fn(draw_boxes, elems, dtype=tf.uint8, back_prop=False)
  return images
def draw_side_by_side_evaluation_image(eval_dict,
                                       category_index,
                                       max_boxes_to_draw=20,
                                       min_score_thresh=0.2,
                                       use_normalized_coordinates=True):
  """Creates a side-by-side image with detections and groundtruth.

  Bounding boxes (and instance masks, if available) are visualized on both
  subimages.

  Args:
    eval_dict: The evaluation dictionary returned by
      eval_util.result_dict_for_single_example().
    category_index: A category index (dictionary) produced from a labelmap.
    max_boxes_to_draw: The maximum number of boxes to draw for detections.
    min_score_thresh: The minimum score threshold for showing detections.
    use_normalized_coordinates: Whether to assume boxes and keypoints are in
      normalized coordinates (as opposed to absolute coordinates).
      Default is True.

  Returns:
    A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to
    detections, while the subimage on the right corresponds to groundtruth.
  """
  detection_fields = fields.DetectionResultFields()
  input_data_fields = fields.InputDataFields()
  # Optional tensors: only fed through when present in eval_dict. A batch
  # dimension of size 1 is added since the drawing helper expects batches.
  instance_masks = None
  if detection_fields.detection_masks in eval_dict:
    instance_masks = tf.cast(
        tf.expand_dims(eval_dict[detection_fields.detection_masks], axis=0),
        tf.uint8)
  keypoints = None
  if detection_fields.detection_keypoints in eval_dict:
    keypoints = tf.expand_dims(
        eval_dict[detection_fields.detection_keypoints], axis=0)
  groundtruth_instance_masks = None
  if input_data_fields.groundtruth_instance_masks in eval_dict:
    groundtruth_instance_masks = tf.cast(
        tf.expand_dims(
            eval_dict[input_data_fields.groundtruth_instance_masks], axis=0),
        tf.uint8)
  # Left subimage: model detections filtered by score threshold.
  images_with_detections = draw_bounding_boxes_on_image_tensors(
      eval_dict[input_data_fields.original_image],
      tf.expand_dims(eval_dict[detection_fields.detection_boxes], axis=0),
      tf.expand_dims(eval_dict[detection_fields.detection_classes], axis=0),
      tf.expand_dims(eval_dict[detection_fields.detection_scores], axis=0),
      category_index,
      instance_masks=instance_masks,
      keypoints=keypoints,
      max_boxes_to_draw=max_boxes_to_draw,
      min_score_thresh=min_score_thresh,
      use_normalized_coordinates=use_normalized_coordinates)
  # Right subimage: groundtruth boxes with dummy score 1.0 so every box is
  # drawn (threshold 0.0, no box-count limit).
  images_with_groundtruth = draw_bounding_boxes_on_image_tensors(
      eval_dict[input_data_fields.original_image],
      tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0),
      tf.expand_dims(eval_dict[input_data_fields.groundtruth_classes], axis=0),
      tf.expand_dims(
          tf.ones_like(
              eval_dict[input_data_fields.groundtruth_classes],
              dtype=tf.float32),
          axis=0),
      category_index,
      instance_masks=groundtruth_instance_masks,
      keypoints=None,
      max_boxes_to_draw=None,
      min_score_thresh=0.0,
      use_normalized_coordinates=use_normalized_coordinates)
  # Concatenate along width: detections left, groundtruth right.
  return tf.concat([images_with_detections, images_with_groundtruth], axis=2)
def draw_keypoints_on_image_array(image,
                                  keypoints,
                                  color='red',
                                  radius=2,
                                  use_normalized_coordinates=True):
  """Draws keypoints on an image (numpy array).

  Args:
    image: a numpy array with shape [height, width, 3].
    keypoints: a numpy array with shape [num_keypoints, 2].
    color: color to draw the keypoints with. Default is red.
    radius: keypoint radius. Default value is 2.
    use_normalized_coordinates: if True (default), treat keypoint values as
      relative to the image. Otherwise treat them as absolute.
  """
  rgb_pil = Image.fromarray(np.uint8(image)).convert('RGB')
  draw_keypoints_on_image(rgb_pil, keypoints, color, radius,
                          use_normalized_coordinates)
  # Write the drawn result back into the caller's array.
  np.copyto(image, np.array(rgb_pil))
def draw_keypoints_on_image(image,
                            keypoints,
                            color='red',
                            radius=2,
                            use_normalized_coordinates=True):
  """Draws keypoints on an image.

  Args:
    image: a PIL.Image object.
    keypoints: a numpy array with shape [num_keypoints, 2].
    color: color to draw the keypoints with. Default is red.
    radius: keypoint radius. Default value is 2.
    use_normalized_coordinates: if True (default), treat keypoint values as
      relative to the image. Otherwise treat them as absolute.
  """
  draw = ImageDraw.Draw(image)
  width, height = image.size
  # Keypoints are stored (y, x).
  ys = [pt[0] for pt in keypoints]
  xs = [pt[1] for pt in keypoints]
  if use_normalized_coordinates:
    xs = tuple([width * x for x in xs])
    ys = tuple([height * y for y in ys])
  for x, y in zip(xs, ys):
    # Each keypoint is rendered as a filled circle of the given radius.
    draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)],
                 outline=color, fill=color)
def draw_mask_on_image_array(image, mask, color='red', alpha=0.4):
  """Draws mask on an image.

  Args:
    image: uint8 numpy array with shape (img_height, img_height, 3)
    mask: a uint8 numpy array of shape (img_height, img_height) with
      values between either 0 or 1.
    color: color to draw the keypoints with. Default is red.
    alpha: transparency value between 0 and 1. (default: 0.4)

  Raises:
    ValueError: On incorrect data type for image or masks.
  """
  # Validate before touching PIL; the order of checks (image dtype, mask
  # dtype, mask values, shape match) is part of the observable behavior.
  if image.dtype != np.uint8:
    raise ValueError('`image` not of type np.uint8')
  if mask.dtype != np.uint8:
    raise ValueError('`mask` not of type np.uint8')
  if np.any(np.logical_and(mask != 1, mask != 0)):
    raise ValueError('`mask` elements should be in [0, 1]')
  if image.shape[:2] != mask.shape:
    raise ValueError('The image has spatial dimensions %s but the mask has '
                     'dimensions %s' % (image.shape[:2], mask.shape))
  rgb = ImageColor.getrgb(color)
  base = Image.fromarray(image)
  # Solid-color plane the same spatial size as the mask.
  color_plane = np.expand_dims(
      np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
  overlay = Image.fromarray(np.uint8(color_plane)).convert('RGBA')
  # Alpha channel: opaque (255*alpha) where the mask is 1, fully
  # transparent elsewhere.
  blend_mask = Image.fromarray(np.uint8(255.0 * alpha * mask)).convert('L')
  composed = Image.composite(overlay, base, blend_mask)
  np.copyto(image, np.array(composed.convert('RGB')))
def visualize_boxes_and_labels_on_image_array(
    image,
    boxes,
    classes,
    scores,
    category_index,
    instance_masks=None,
    instance_boundaries=None,
    keypoints=None,
    use_normalized_coordinates=False,
    max_boxes_to_draw=20,
    min_score_thresh=.5,
    agnostic_mode=False,
    line_thickness=4,
    groundtruth_box_visualization_color='black',
    skip_scores=False,
    skip_labels=False):
  """Overlay labeled boxes on an image with formatted scores and label names.

  This function groups boxes that correspond to the same location
  and creates a display string for each detection and overlays these
  on the image. Note that this function modifies the image in place, and
  returns that same image.

  Args:
    image: uint8 numpy array with shape (img_height, img_width, 3)
    boxes: a numpy array of shape [N, 4]
    classes: a numpy array of shape [N]. Note that class indices are 1-based,
      and match the keys in the label map.
    scores: a numpy array of shape [N] or None. If scores=None, then
      this function assumes that the boxes to be plotted are groundtruth
      boxes and plot all boxes as black with no classes or scores.
    category_index: a dict containing category dictionaries (each holding
      category index `id` and category name `name`) keyed by category indices.
    instance_masks: a numpy array of shape [N, image_height, image_width] with
      values ranging between 0 and 1, can be None.
    instance_boundaries: a numpy array of shape [N, image_height, image_width]
      with values ranging between 0 and 1, can be None.
    keypoints: a numpy array of shape [N, num_keypoints, 2], can
      be None
    use_normalized_coordinates: whether boxes is to be interpreted as
      normalized coordinates or not.
    max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
      all boxes.
    min_score_thresh: minimum score threshold for a box to be visualized
    agnostic_mode: boolean (default: False) controlling whether to evaluate in
      class-agnostic mode or not. This mode will display scores but ignore
      classes.
    line_thickness: integer (default: 4) controlling line width of the boxes.
    groundtruth_box_visualization_color: box color for visualizing groundtruth
      boxes
    skip_scores: whether to skip score when drawing a single detection
    skip_labels: whether to skip label when drawing a single detection

  Returns:
    uint8 numpy array with shape (img_height, img_width, 3) with overlaid
    boxes.
  """
  # Create a display string (and color) for every box location, group any boxes
  # that correspond to the same location.
  box_to_display_str_map = collections.defaultdict(list)
  box_to_color_map = collections.defaultdict(str)
  box_to_instance_masks_map = {}
  box_to_instance_boundaries_map = {}
  box_to_keypoints_map = collections.defaultdict(list)
  if not max_boxes_to_draw:
    max_boxes_to_draw = boxes.shape[0]
  for i in range(min(max_boxes_to_draw, boxes.shape[0])):
    if scores is None or scores[i] > min_score_thresh:
      # Box coordinates as a tuple serve as the grouping key; identical
      # locations from different detections collapse to one drawn box.
      box = tuple(boxes[i].tolist())
      if instance_masks is not None:
        box_to_instance_masks_map[box] = instance_masks[i]
      if instance_boundaries is not None:
        box_to_instance_boundaries_map[box] = instance_boundaries[i]
      if keypoints is not None:
        box_to_keypoints_map[box].extend(keypoints[i])
      if scores is None:
        # Groundtruth mode: fixed color, no label/score strings.
        box_to_color_map[box] = groundtruth_box_visualization_color
      else:
        display_str = ''
        if not skip_labels:
          if not agnostic_mode:
            if classes[i] in category_index.keys():
              class_name = category_index[classes[i]]['name']
            else:
              class_name = 'N/A'
            display_str = str(class_name)
        if not skip_scores:
          if not display_str:
            display_str = '{}%'.format(int(100*scores[i]))
          else:
            display_str = '{}: {}%'.format(display_str, int(100*scores[i]))
        box_to_display_str_map[box].append(display_str)
        if agnostic_mode:
          box_to_color_map[box] = 'DarkOrange'
        else:
          # Deterministic color per class id.
          box_to_color_map[box] = STANDARD_COLORS[
              classes[i] % len(STANDARD_COLORS)]

  # Draw all boxes onto image.
  for box, color in box_to_color_map.items():
    ymin, xmin, ymax, xmax = box
    if instance_masks is not None:
      draw_mask_on_image_array(
          image,
          box_to_instance_masks_map[box],
          color=color
      )
    if instance_boundaries is not None:
      # Boundaries are always drawn fully opaque in red.
      draw_mask_on_image_array(
          image,
          box_to_instance_boundaries_map[box],
          color='red',
          alpha=1.0
      )
    draw_bounding_box_on_image_array(
        image,
        ymin,
        xmin,
        ymax,
        xmax,
        color=color,
        thickness=line_thickness,
        display_str_list=box_to_display_str_map[box],
        use_normalized_coordinates=use_normalized_coordinates)
    if keypoints is not None:
      draw_keypoints_on_image_array(
          image,
          box_to_keypoints_map[box],
          color=color,
          radius=line_thickness / 2,
          use_normalized_coordinates=use_normalized_coordinates)

  return image
def add_cdf_image_summary(values, name):
  """Adds a tf.summary.image for a CDF plot of the values.

  Normalizes `values` such that they sum to 1, plots the cumulative
  distribution function and creates a tf image summary.

  Args:
    values: a 1-D float32 tensor containing the values.
    name: name for the image summary.
  """

  def cdf_plot(values):
    """Numpy function that renders the CDF plot as a [1, H, W, 3] array."""
    normalized_values = values / np.sum(values)
    sorted_values = np.sort(normalized_values)
    cumulative_values = np.cumsum(sorted_values)
    fraction_of_examples = (np.arange(cumulative_values.size, dtype=np.float32)
                            / cumulative_values.size)
    fig = plt.figure(frameon=False)
    # Integer subplot spec; the string form '111' is deprecated.
    ax = fig.add_subplot(111)
    ax.plot(fraction_of_examples, cumulative_values)
    ax.set_ylabel('cumulative normalized values')
    ax.set_xlabel('fraction of examples')
    fig.canvas.draw()
    width, height = fig.get_size_inches() * fig.get_dpi()
    # np.frombuffer replaces the deprecated np.fromstring; identical result
    # for raw uint8 RGB canvas data.
    image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8').reshape(
        1, int(height), int(width), 3)
    return image

  # Distinct name so the plotting function above is not shadowed by the op.
  cdf_plot_op = tf.py_func(cdf_plot, [values], tf.uint8)
  tf.summary.image(name, cdf_plot_op)
def add_hist_image_summary(values, bins, name):
  """Adds a tf.summary.image for a histogram plot of the values.

  Plots the histogram of values and creates a tf image summary.

  Args:
    values: a 1-D float32 tensor containing the values.
    bins: bin edges which will be directly passed to np.histogram.
    name: name for the image summary.
  """

  def hist_plot(values, bins):
    """Numpy function that renders the histogram as a [1, H, W, 3] array."""
    fig = plt.figure(frameon=False)
    # Integer subplot spec; the string form '111' is deprecated.
    ax = fig.add_subplot(111)
    y, x = np.histogram(values, bins=bins)
    # np.histogram returns len(x) == len(y) + 1 bin edges; plot against the
    # left edges.
    ax.plot(x[:-1], y)
    ax.set_ylabel('count')
    ax.set_xlabel('value')
    fig.canvas.draw()
    width, height = fig.get_size_inches() * fig.get_dpi()
    # np.frombuffer replaces the deprecated np.fromstring; identical result
    # for raw uint8 RGB canvas data.
    image = np.frombuffer(
        fig.canvas.tostring_rgb(), dtype='uint8').reshape(
            1, int(height), int(width), 3)
    return image

  # Distinct name so the plotting function above is not shadowed by the op.
  hist_plot_op = tf.py_func(hist_plot, [values, bins], tf.uint8)
  tf.summary.image(name, hist_plot_op)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Initializes at random location and visualizes the optimal path.
Different modes of execution:
1) benchmark: It generates benchmark_iter sample trajectory to random goals
and plots the histogram of path lengths. It can be also used to see how fast
it runs.
2) vis: It visualizes the generated paths by image, semantic segmentation, and
so on.
3) human: allows the user to navigate through environment from keyboard input.
python viz_active_vision_dataset_main -- \
--mode=benchmark --benchmark_iter=1000 --gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=vis \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=human \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main.py --mode=eval --eval_folder=/usr/local/google/home/$USER/checkin_log_det/evals/ --output_folder=/usr/local/google/home/$USER/test_imgs/ --gin_config=envs/configs/active_vision_config.gin
"""
import matplotlib
# pylint: disable=g-import-not-at-top
# Need Tk for interactive plots.
matplotlib.use('TkAgg')
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np
import os
from pyglib import app
from pyglib import flags
import gin
import cv2
from envs import active_vision_dataset_env
from envs import task_env
# Execution modes selectable via --mode (see module docstring).
VIS_MODE = 'vis'
HUMAN_MODE = 'human'
BENCHMARK_MODE = 'benchmark'
GRAPH_MODE = 'graph'
EVAL_MODE = 'eval'
flags.DEFINE_enum('mode', VIS_MODE,
                  [VIS_MODE, HUMAN_MODE, BENCHMARK_MODE, GRAPH_MODE, EVAL_MODE],
                  'mode of the execution')
flags.DEFINE_integer('benchmark_iter', 1000,
                     'number of iterations for benchmarking')
flags.DEFINE_string('eval_folder', '', 'the path to the eval folder')
flags.DEFINE_string('output_folder', '',
                    'the path to which the images and gifs are written')
flags.DEFINE_multi_string('gin_config', [],
                          'List of paths to a gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
                          'Newline separated list of Gin parameter bindings.')
# Short alias for the modality-type enum used throughout this script.
mt = task_env.ModalityTypes
FLAGS = flags.FLAGS
def benchmark(env, targets):
  """Benchmarks the speed of sequence generation by env.

  Draws FLAGS.benchmark_iter random step sequences from the environment,
  accumulates action-frequency and episode-length statistics, and plots a
  histogram of episode lengths per goal class.

  Args:
    env: environment.
    targets: list of target classes.
  """
  # Episode lengths keyed by goal class index.
  episode_lengths = {}
  # Per-world set of (start_image_id, goal, episode_length) tuples observed.
  all_init_configs = {}
  # Mean per-episode selection frequency of each action over all iterations.
  all_actions = dict([(a, 0.) for a in env.actions])
  for i in range(FLAGS.benchmark_iter):
    path, actions, _, _ = env.random_step_sequence()
    # actions is one-hot per step; recover the chosen action indices.
    selected_actions = np.argmax(actions, axis=-1)
    new_actions = dict([(a, 0.) for a in env.actions])
    for a in selected_actions:
      # Frequency of each action within this episode.
      new_actions[env.actions[a]] += 1. / selected_actions.shape[0]
    for a in new_actions:
      # Running mean of per-episode frequencies across iterations.
      all_actions[a] += new_actions[a] / FLAGS.benchmark_iter
    start_image_id, world, goal = env.get_init_config(path)
    print world
    if world not in all_init_configs:
      all_init_configs[world] = set()
    all_init_configs[world].add((start_image_id, goal, len(actions)))
    if env.goal_index not in episode_lengths:
      episode_lengths[env.goal_index] = []
    episode_lengths[env.goal_index].append(len(actions))
  # One histogram subplot per goal class in a 2x3 grid (assumes <= 6 classes).
  for i, cls in enumerate(episode_lengths):
    plt.subplot(231 + i)
    plt.hist(episode_lengths[cls])
    plt.title(targets[cls])
  plt.show()
def human(env, targets):
  """Lets user play around the env manually.

  Runs up to 20 interactive episodes. Each step renders the current
  observation modalities with matplotlib and reads the next action from
  keyboard input (a/d/w/s to translate, j/l to rotate, n to stop).

  Args:
    env: environment.
    targets: list of target classes (used for the goal subplot title).
  """
  # Keyboard key -> action name (string action interface of the env).
  string_key_map = {
      'a': 'left',
      'd': 'right',
      'w': 'forward',
      's': 'backward',
      'j': 'rotate_ccw',
      'l': 'rotate_cw',
      'n': 'stop'
  }
  # Keyboard key -> action index (integer action interface of the env).
  integer_key_map = {
      'a': env.actions.index('left'),
      'd': env.actions.index('right'),
      'w': env.actions.index('forward'),
      's': env.actions.index('backward'),
      'j': env.actions.index('rotate_ccw'),
      'l': env.actions.index('rotate_cw'),
      'n': env.actions.index('stop')
  }
  for k in integer_key_map:
    integer_key_map[k] = np.int32(integer_key_map[k])
  plt.ion()
  for _ in range(20):
    obs = env.reset()
    steps = -1
    action = None
    while True:
      print 'distance = ', obs[task_env.ModalityTypes.DISTANCE]
      steps += 1
      # DEPTH modality carries the depth value in channel 0 and a second
      # channel used as a mask.
      depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
      depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
      seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
      # Detection modality is per-class scores; display the argmax class map.
      det_mask = np.argmax(
          obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
      img = obs[task_env.ModalityTypes.IMAGE]
      plt.subplot(231)
      plt.title('steps = {}'.format(steps))
      plt.imshow(img.astype(np.uint8))
      plt.subplot(232)
      plt.imshow(depth_value)
      plt.title('depth value')
      plt.subplot(233)
      plt.imshow(depth_mask)
      plt.title('depth mask')
      plt.subplot(234)
      plt.imshow(seg_mask)
      plt.title('seg')
      plt.subplot(235)
      plt.imshow(det_mask)
      plt.title('det')
      plt.subplot(236)
      plt.title('goal={}'.format(targets[env.goal_index]))
      plt.draw()
      while True:
        s = raw_input('key = ')
        # Randomly alternates between the string and integer action
        # encodings -- presumably to exercise both env.step interfaces;
        # TODO confirm this is intentional.
        if np.random.rand() > 0.5:
          key_map = string_key_map
        else:
          key_map = integer_key_map
        if s in key_map:
          action = key_map[s]
          break
        else:
          print 'invalid action'
      print 'action = {}'.format(action)
      # NOTE(review): only matches the *string* 'stop'; when the integer map
      # was chosen, a stop action is passed to env.step below instead.
      if action == 'stop':
        # path_to_goal includes the current and goal nodes; subtract 2 for
        # the number of remaining steps.
        print 'dist to goal: {}'.format(len(env.path_to_goal()) - 2)
        break
      obs, reward, done, info = env.step(action)
      print 'reward = {}, done = {}, success = {}'.format(
          reward, done, info['success'])
def visualize_random_step_sequence(env):
  """Visualizes random sequence of steps.

  Draws up to 20 random step sequences (max 30 steps each) from the env and
  renders the observation modalities of every step with matplotlib.

  Args:
    env: environment.
  """
  plt.ion()
  for _ in range(20):
    path, actions, _, step_outputs = env.random_step_sequence(max_len=30)
    print 'path = {}'.format(path)
    for action, step_output in zip(actions, step_outputs):
      obs, _, done, _ = step_output
      # DEPTH modality: channel 0 is the depth value, channel 1 a mask.
      depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
      depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
      seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
      # Reduce per-class detection scores to an argmax class map.
      det_mask = np.argmax(
          obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
      img = obs[task_env.ModalityTypes.IMAGE]
      plt.subplot(231)
      plt.imshow(img.astype(np.uint8))
      plt.subplot(232)
      plt.imshow(depth_value)
      plt.title('depth value')
      plt.subplot(233)
      plt.imshow(depth_mask)
      plt.title('depth mask')
      plt.subplot(234)
      plt.imshow(seg_mask)
      plt.title('seg')
      plt.subplot(235)
      plt.imshow(det_mask)
      plt.title('det')
      plt.subplot(236)
      print 'action = {}'.format(action)
      print 'done = {}'.format(done)
      plt.draw()
      if raw_input('press \'n\' to go to the next random sequence. Otherwise, '
                   'press any key to continue...') == 'n':
        break
def visualize(env, input_folder, output_root_folder):
"""visualizes images for sequence of steps from the evals folder."""
def which_env(file_name):
img_name = file_name.split('_')[0][2:5]
env_dict = {'161': 'Home_016_1', '131': 'Home_013_1', '111': 'Home_011_1'}
if img_name in env_dict:
return env_dict[img_name]
else:
raise ValueError('could not resolve env: {} {}'.format(
img_name, file_name))
def which_goal(file_name):
return file_name[file_name.find('_')+1:]
output_images_folder = os.path.join(output_root_folder, 'images')
output_gifs_folder = os.path.join(output_root_folder, 'gifs')
if not tf.gfile.IsDirectory(output_images_folder):
tf.gfile.MakeDirs(output_images_folder)
if not tf.gfile.IsDirectory(output_gifs_folder):
tf.gfile.MakeDirs(output_gifs_folder)
npy_files = [
os.path.join(input_folder, name)
for name in tf.gfile.ListDirectory(input_folder)
if name.find('npy') >= 0
]
for i, npy_file in enumerate(npy_files):
print 'saving images {}/{}'.format(i, len(npy_files))
pure_name = npy_file[npy_file.rfind('/') + 1:-4]
output_folder = os.path.join(output_images_folder, pure_name)
if not tf.gfile.IsDirectory(output_folder):
tf.gfile.MakeDirs(output_folder)
print '*******'
print pure_name[0:pure_name.find('_')]
env.reset_for_eval(which_env(pure_name),
which_goal(pure_name),
pure_name[0:pure_name.find('_')],
)
with tf.gfile.Open(npy_file) as h:
states = np.load(h).item()['states']
images = [
env.observation(state)[mt.IMAGE] for state in states
]
for j, img in enumerate(images):
cv2.imwrite(os.path.join(output_folder, '{0:03d}'.format(j) + '.jpg'),
img[:, :, ::-1])
print 'converting to gif'
os.system(
'convert -set delay 20 -colors 256 -dispose 1 {}/*.jpg {}.gif'.format(
output_folder,
os.path.join(output_gifs_folder, pure_name + '.gif')
)
)
def evaluate_folder(env, folder_path):
"""Evaluates the performance from the evals folder."""
targets = ['fridge', 'dining_table', 'microwave', 'tv', 'couch']
def compute_acc(npy_file):
with tf.gfile.Open(npy_file) as h:
data = np.load(h).item()
if npy_file.find('dining_table') >= 0:
category = 'dining_table'
else:
category = npy_file[npy_file.rfind('_') + 1:-4]
return category, data['distance'][-1] - 2
def evaluate_iteration(folder):
"""Evaluates the data from the folder of certain eval iteration."""
print folder
npy_files = [
os.path.join(folder, name)
for name in tf.gfile.ListDirectory(folder)
if name.find('npy') >= 0
]
eval_stats = {c: [] for c in targets}
for npy_file in npy_files:
try:
category, dist = compute_acc(npy_file)
except: # pylint: disable=bare-except
continue
eval_stats[category].append(float(dist <= 5))
for c in eval_stats:
if not eval_stats[c]:
print 'incomplete eval {}: empty class {}'.format(folder_path, c)
return None
eval_stats[c] = np.mean(eval_stats[c])
eval_stats['mean'] = np.mean(eval_stats.values())
return eval_stats
checkpoint_folders = [
folder_path + x
for x in tf.gfile.ListDirectory(folder_path)
if tf.gfile.IsDirectory(folder_path + x)
]
print '{} folders found'.format(len(checkpoint_folders))
print '------------------------'
all_iters = []
all_accs = []
for i, folder in enumerate(checkpoint_folders):
print 'processing {}/{}'.format(i, len(checkpoint_folders))
eval_stats = evaluate_iteration(folder)
if eval_stats is None:
continue
else:
iter_no = int(folder[folder.rfind('/') + 1:])
print 'result ', iter_no, eval_stats['mean']
all_accs.append(eval_stats['mean'])
all_iters.append(iter_no)
all_accs = np.asarray(all_accs)
all_iters = np.asarray(all_iters)
idx = np.argmax(all_accs)
print 'best result at iteration {} was {}'.format(all_iters[idx],
all_accs[idx])
order = np.argsort(all_iters)
all_iters = all_iters[order]
all_accs = all_accs[order]
#plt.plot(all_iters, all_accs)
#plt.show()
#print 'done plotting'
best_iteration_folder = os.path.join(folder_path, str(all_iters[idx]))
print 'generating gifs and images for {}'.format(best_iteration_folder)
visualize(env, best_iteration_folder, FLAGS.output_folder)
def main(_):
  """Parses gin configs, builds the env, and dispatches on FLAGS.mode."""
  gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
  # Echo the effective configuration for easier debugging of runs.
  print('********')
  print(FLAGS.mode)
  print(FLAGS.gin_config)
  print(FLAGS.gin_params)
  modalities = [
      task_env.ModalityTypes.IMAGE,
      task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
      task_env.ModalityTypes.OBJECT_DETECTION,
      task_env.ModalityTypes.DEPTH,
      task_env.ModalityTypes.DISTANCE,
  ]
  env = active_vision_dataset_env.ActiveVisionDatasetEnv(
      modality_types=modalities)

  def run_graph_mode():
    # Sanity-check the scene graph of every world.
    for world in env.worlds:
      env.check_scene_graph(world, 'fridge')

  # Mode name -> thunk executing that mode.
  dispatch = {
      BENCHMARK_MODE: lambda: benchmark(env, env.possible_targets),
      GRAPH_MODE: run_graph_mode,
      HUMAN_MODE: lambda: human(env, env.possible_targets),
      VIS_MODE: lambda: visualize_random_step_sequence(env),
      EVAL_MODE: lambda: evaluate_folder(env, FLAGS.eval_folder),
  }
  if FLAGS.mode in dispatch:
    dispatch[FLAGS.mode]()
# Script entry point: app.run parses command-line flags and invokes main.
if __name__ == '__main__':
  app.run(main)
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Compression with Neural Networks
This is a [TensorFlow](http://www.tensorflow.org/) model repo containing
research on compression with neural networks. This repo currently contains
code for the following papers:
[Full Resolution Image Compression with Recurrent Neural Networks](https://arxiv.org/abs/1608.05148)
## Organization
[Image Encoder](image_encoder/): Encoding and decoding images into their binary representation.
[Entropy Coder](entropy_coder/): Lossless compression of the binary representation.
## Contact Info
Model repository maintained by Nick Johnston ([nmjohn](https://github.com/nmjohn)).
# Neural net based entropy coding
This is a [TensorFlow](http://www.tensorflow.org/) model for additional
lossless compression of bitstreams generated by neural net based image
encoders as described in
[https://arxiv.org/abs/1703.10114](https://arxiv.org/abs/1703.10114).
To be more specific, the entropy coder aims at compressing further binary
codes which have a 3D tensor structure with:
* the first two dimensions of the tensors corresponding to the height and
the width of the binary codes,
* the last dimension being the depth of the codes. The last dimension can be
sliced into N groups of K, where each additional group is used by the image
decoder to add more details to the reconstructed image.
The code in this directory only contains the underlying code probability model
but does not perform the actual compression using arithmetic coding.
The code probability model is enough to compute the theoretical compression
ratio.
## Prerequisites
The only software requirement for running the encoder and decoder is having
TensorFlow installed.
You will also need to add the top level source directory of the entropy coder
to your `PYTHONPATH`, for example:
`export PYTHONPATH=${PYTHONPATH}:/tmp/models/compression`
## Training the entropy coder
### Synthetic dataset
If you do not have a training dataset, there is a simple code generative model
that you can use to generate a dataset and play with the entropy coder.
The generative model is located under dataset/gen\_synthetic\_dataset.py. Note
that this simple generative model is not going to give good results on real
images as it is not supposed to be close to the statistics of the binary
representation of encoded images. Consider it as a toy dataset, no more, no
less.
To generate a synthetic dataset with 20000 samples:
`mkdir -p /tmp/dataset`
`python ./dataset/gen_synthetic_dataset.py --dataset_dir=/tmp/dataset/
--count=20000`
Note that the generator has not been optimized at all, generating the synthetic
dataset is currently pretty slow.
### Training
If you just want to play with the entropy coder trainer, here is the command
line that can be used to train the entropy coder on the synthetic dataset:
`mkdir -p /tmp/entropy_coder_train`
`python ./core/entropy_coder_train.py --task=0
--train_dir=/tmp/entropy_coder_train/
--model=progressive
--model_config=./configs/synthetic/model_config.json
--train_config=./configs/synthetic/train_config.json
--input_config=./configs/synthetic/input_config.json
`
Training is configured using 3 files formatted using JSON:
* One file is used to configure the underlying entropy coder model.
Currently, only the *progressive* model is supported.
This model takes 2 mandatory parameters and an optional one:
* `layer_depth`: the number of bits per layer (a.k.a. iteration).
Background: the image decoder takes each layer to add more detail
to the image.
* `layer_count`: the maximum number of layers that should be supported
by the model. This should be equal or greater than the maximum number
of layers in the input binary codes.
* `coded_layer_count`: This can be used to consider only partial codes,
keeping only the first `coded_layer_count` layers and ignoring the
remaining layers. If left empty, the binary codes are left unchanged.
* One file to configure the training, including the learning rate, ...
  The meanings of the parameters are pretty straightforward. Note that this
file is only used during training and is not needed during inference.
* One file to specify the input dataset to use during training.
The dataset is formatted using tf.RecordIO.
## Inference: file size after entropy coding.
### Using a synthetic sample
Here is the command line to generate a single synthetic sample formatted
in the same way as what is provided by the image encoder:
`python ./dataset/gen_synthetic_single.py
--sample_filename=/tmp/dataset/sample_0000.npz`
To actually compute the additional compression ratio using the entropy coder
trained in the previous step:
`python ./core/entropy_coder_single.py
--model=progressive
--model_config=./configs/synthetic/model_config.json
--input_codes=/tmp/dataset/sample_0000.npz
--checkpoint=/tmp/entropy_coder_train/model.ckpt-209078`
where the checkpoint number should be adjusted accordingly.
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Import and register all the entropy coder models."""
# pylint: disable=unused-import
from entropy_coder.progressive import progressive
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic test of all registered models."""
import tensorflow as tf
# pylint: disable=unused-import
import all_models
# pylint: enable=unused-import
from entropy_coder.model import model_factory
class AllModelsTest(tf.test.TestCase):
  """Smoke test: every registered entropy coder model builds a graph."""

  def testBuildModelForTraining(self):
    # Iterate over all registered models so that newly registered models are
    # covered automatically without editing this test.
    factory = model_factory.GetModelRegistry()
    model_names = factory.GetAvailableModels()
    for m in model_names:
      # Fresh graph per model so models do not share variables/ops.
      tf.reset_default_graph()
      global_step = tf.Variable(tf.zeros([], dtype=tf.int64),
                                trainable=False,
                                name='global_step')
      optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
      batch_size = 3
      height = 40
      width = 20
      depth = 5
      binary_codes = tf.placeholder(dtype=tf.float32,
                                    shape=[batch_size, height, width, depth])
      # Create a model with the default configuration.
      print('Creating model: {}'.format(m))
      model = factory.CreateModel(m)
      model.Initialize(global_step,
                       optimizer,
                       model.GetConfigStringForUnitTest())
      # Before BuildGraph, no loss/train_op/metrics should exist yet.
      self.assertTrue(model.loss is None, 'model: {}'.format(m))
      self.assertTrue(model.train_op is None, 'model: {}'.format(m))
      self.assertTrue(model.average_code_length is None, 'model: {}'.format(m))
      # Build the Tensorflow graph corresponding to the model.
      model.BuildGraph(binary_codes)
      self.assertTrue(model.loss is not None, 'model: {}'.format(m))
      self.assertTrue(model.average_code_length is not None,
                      'model: {}'.format(m))
      # Some models may legitimately be inference-only (no train_op).
      if model.train_op is None:
        print('Model {} is not trainable'.format(m))
# Test entry point.
if __name__ == '__main__':
  tf.test.main()
{
"data": "/tmp/dataset/synthetic_dataset",
"unique_code_size": true
}
{
"batch_size": 4,
"learning_rate": 0.1,
"decay_rate": 0.9,
"samples_per_decay": 20000
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment