Unverified Commit 5a5d3305 authored by Lukasz Kaiser, committed by GitHub

Merge pull request #2969 from coreylynch/master

Adding TCN.
parents 69cf6fca aa3d4422
@@ -32,6 +32,7 @@ research/slim/* @sguada @nathansilberman
research/street/* @theraysmith
research/swivel/* @waterson
research/syntaxnet/* @calberti @andorardo @bogatyy @markomernick
research/tcn/* @coreylynch @sermanet
research/textsum/* @panyx0718 @peterjliu
research/transformer/* @daviddao
research/video_prediction/* @cbfinn
@@ -61,6 +61,7 @@ installation](https://www.tensorflow.org/install).
using a Deep RNN.
- [swivel](swivel): the Swivel algorithm for generating word embeddings.
- [syntaxnet](syntaxnet): neural models of natural language syntax.
- [tcn](tcn): Self-supervised representation learning from multi-view video.
- [textsum](textsum): sequence-to-sequence with attention model for text
summarization.
- [transformer](transformer): spatial transformer network, which allows the
package(default_visibility = [":internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
package_group(
name = "internal",
packages = [
"//tcn/...",
],
)
py_binary(
name = "download_pretrained",
srcs = [
"download_pretrained.py",
],
)
py_binary(
name = "generate_videos",
srcs = [
"generate_videos.py",
],
main = "generate_videos.py",
deps = [
":data_providers",
":get_estimator",
":util",
],
)
py_test(
name = "svtcn_loss_test",
size = "medium",
srcs = [
"estimators/svtcn_loss.py",
"estimators/svtcn_loss_test.py",
],
deps = [
":util",
],
)
py_library(
name = "data_providers",
srcs = [
"data_providers.py",
],
deps = [
":preprocessing",
],
)
py_test(
name = "data_providers_test",
size = "large",
srcs = ["data_providers_test.py"],
deps = [
":data_providers",
],
)
py_library(
name = "preprocessing",
srcs = [
"preprocessing.py",
],
)
py_binary(
name = "get_estimator",
srcs = [
"estimators/get_estimator.py",
],
deps = [
":mvtcn_estimator",
":svtcn_estimator",
],
)
py_binary(
name = "base_estimator",
srcs = [
"estimators/base_estimator.py",
"model.py",
],
deps = [
":data_providers",
":util",
],
)
py_library(
name = "util",
srcs = [
"utils/luatables.py",
"utils/progress.py",
"utils/util.py",
],
)
py_binary(
name = "mvtcn_estimator",
srcs = [
"estimators/mvtcn_estimator.py",
],
deps = [
":base_estimator",
],
)
py_binary(
name = "svtcn_estimator",
srcs = [
"estimators/svtcn_estimator.py",
"estimators/svtcn_loss.py",
],
deps = [
":base_estimator",
],
)
py_binary(
name = "train",
srcs = [
"train.py",
],
deps = [
":data_providers",
":get_estimator",
":util",
],
)
py_binary(
name = "labeled_eval",
srcs = [
"labeled_eval.py",
],
deps = [
":get_estimator",
],
)
py_test(
name = "labeled_eval_test",
size = "small",
srcs = ["labeled_eval_test.py"],
deps = [
":labeled_eval",
],
)
py_binary(
name = "eval",
srcs = [
"eval.py",
],
deps = [
":get_estimator",
],
)
py_binary(
name = "alignment",
srcs = [
"alignment.py",
],
deps = [
":get_estimator",
],
)
py_binary(
name = "visualize_embeddings",
srcs = [
"visualize_embeddings.py",
],
deps = [
":data_providers",
":get_estimator",
":util",
],
)
py_binary(
name = "webcam",
srcs = [
"dataset/webcam.py",
],
main = "dataset/webcam.py",
)
py_binary(
name = "images_to_videos",
srcs = [
"dataset/images_to_videos.py",
],
main = "dataset/images_to_videos.py",
)
py_binary(
name = "videos_to_tfrecords",
srcs = [
"dataset/videos_to_tfrecords.py",
],
main = "dataset/videos_to_tfrecords.py",
deps = [
":preprocessing",
],
)
# Time Contrastive Networks
This implements ["Time Contrastive Networks"](https://arxiv.org/abs/1704.06888),
which is part of the larger [Self-Supervised Imitation
Learning](https://sermanet.github.io/imitation/) project.
![](https://sermanet.github.io/tcn/docs/figs/mvTCN.png)
## Contacts
Maintainers of TCN:
* Corey Lynch: [github](https://github.com/coreylynch),
[twitter](https://twitter.com/coreylynch)
* Pierre Sermanet: [github](https://github.com/sermanet),
[twitter](https://twitter.com/psermanet)
## Contents
* [Getting Started](#getting-started)
* [Install Dependencies](#install-dependencies)
* [Download the Inception v3
Checkpoint](#download-pretrained-inceptionv3-checkpoint)
* [Run all the tests](#run-all-the-tests)
* [Concepts](#concepts)
* [Multi-view Webcam Video](#multi-view-webcam-video)
* [Data Pipelines](#data-pipelines)
* [Estimators](#estimators)
* [Models](#models)
* [Losses](#losses)
* [Inference](#inference)
* [Configuration](#configuration)
* [Monitoring Training](#monitoring-training)
* [KNN Classification Error](#knn-classification-error)
    * [Multi-view Alignment](#multi-view-alignment)
* [Visualization](#visualization)
* [Nearest Neighbor Imitation
Videos](#nearest-neighbor-imitation-videos)
* [PCA & T-SNE Visualization](#pca-t-sne-visualization)
* [Tutorial Part I: Collecting Multi-View Webcam
Videos](#tutorial-part-i-collecting-multi-view-webcam-videos)
* [Collect Webcam Videos](#collect-webcam-videos)
* [Create TFRecords](#create-tfrecords)
* [Tutorial Part II: Training, Evaluation, and
Visualization](#tutorial-part-ii-training-evaluation-and-visualization)
* [Download Data](#download-data)
* [Download the Inception v3
Checkpoint](#download-pretrained-inceptionv3-checkpoint)
* [Define a Config](#define-a-config)
* [Train](#train)
* [Evaluate](#evaluate)
    * [Monitor training](#monitor-training)
* [Visualize](#visualize)
* [Generate Imitation Videos](#generate-imitation-videos)
* [Run PCA & T-SNE Visualization](#t-sne-pca-visualization)
## Getting started
### Install Dependencies
*   [TensorFlow nightly build](https://pypi.python.org/pypi/tf-nightly-gpu),
    installed via `pip install tf-nightly-gpu`.
* [Bazel](http://bazel.io/docs/install.html)
* matplotlib
* sklearn
* opencv
### Download Pretrained InceptionV3 Checkpoint
Run the script that downloads the pretrained InceptionV3 checkpoint:
```bash
cd tensorflow-models/tcn
python download_pretrained.py
```
### Run all the tests
```bash
bazel test :all
```
## Concepts
### Multi-View Webcam Video
We provide utilities to collect your own multi-view videos in dataset/webcam.py.
See the [webcam tutorial](#tutorial-part-i-collecting-multi-view-webcam-videos)
for an end to end example of how to collect multi-view webcam data and convert
it to the TFRecord format expected by this library.
## Data Pipelines
We use the [tf.data.Dataset
API](https://www.tensorflow.org/programmers_guide/datasets) to construct input
pipelines that feed training, evaluation, and visualization. These pipelines are
defined in `data_providers.py`.
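As a rough illustration (not the code in `data_providers.py`), the sketch below
shows how a `tf.data` pipeline over the multi-view `SequenceExample` TFRecords
produced by `dataset/videos_to_tfrecords.py` could be assembled. The feature
names `view0`/`view1`, `task`, and `len` match what that script writes;
everything else (paths, parallelism) is illustrative.
```python
import os
import tensorflow as tf

def parse_sequence(serialized, num_views=2):
  """Parses one multi-view SequenceExample into per-view jpeg strings."""
  context_features = {
      'task': tf.FixedLenFeature([], tf.string),
      'len': tf.FixedLenFeature([], tf.int64),
  }
  sequence_features = {
      'view%d' % i: tf.FixedLenSequenceFeature([], tf.string)
      for i in range(num_views)
  }
  return tf.parse_single_sequence_example(
      serialized, context_features=context_features,
      sequence_features=sequence_features)

# TF 1.x usage; the real pipelines add decoding, preprocessing, and batching.
filenames = tf.gfile.Glob(
    os.path.expanduser('~/tcn_data/tutorial/train/*.tfrecord'))
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(parse_sequence, num_parallel_calls=4)
context, views = dataset.make_one_shot_iterator().get_next()
```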
## Estimators
We define training, evaluation, and inference behavior using the
[tf.estimator.Estimator
API](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator). See
`estimators/mvtcn_estimator.py` for an example of how multi-view TCN training,
evaluation, and inference is implemented.
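If you haven't used the API before, here is a minimal, self-contained sketch of
the Estimator pattern itself; the single dense-layer "embedder" and
mean-squared-error loss below are placeholders for illustration only, not the
models or losses used in this project.
```python
import tensorflow as tf

def model_fn(features, labels, mode, params):
  # Placeholder embedder: one dense layer over (already flattened) features.
  embeddings = tf.layers.dense(features['images'], params['embedding_size'])
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode, predictions={'embeddings': embeddings})
  loss = tf.losses.mean_squared_error(labels, embeddings)  # placeholder loss
  train_op = tf.train.AdamOptimizer(params['learning_rate']).minimize(
      loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

estimator = tf.estimator.Estimator(
    model_fn=model_fn, model_dir='/tmp/tcn/example',
    params={'embedding_size': 32, 'learning_rate': 1e-4})
```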
## Models
Different embedder architectures are implemented in model.py. We used the
`InceptionConvSSFCEmbedder` in the pouring experiments, but we're also
evaluating `Resnet` embedders.
## Losses
We use the
[tf.contrib.losses.metric_learning](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/losses/metric_learning)
library's implementations of triplet loss with semi-hard negative mining and
npairs loss. In our experiments, npairs loss has better empirical convergence
and produces the best qualitative visualizations, and will likely be our choice
for future experiments. See the
[paper](http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf)
for details on the algorithm.
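As a usage sketch (shapes and labels here are illustrative; see the estimator
code for how anchor/positive pairs are actually built), both losses can be
called directly from `tf.contrib.losses.metric_learning` in TF 1.x:
```python
import tensorflow as tf

batch_size, embedding_size = 32, 128
# One label per anchor/positive pair; pairs come from the same timestep.
labels = tf.range(batch_size)
anchors = tf.random_normal([batch_size, embedding_size])
positives = tf.random_normal([batch_size, embedding_size])

# npairs loss (embeddings not l2-normalized, as in configs/tcn_default.yml).
npairs = tf.contrib.losses.metric_learning.npairs_loss(
    labels=labels, embeddings_anchor=anchors, embeddings_positive=positives)

# Triplet loss with semi-hard mining (expects l2-normalized embeddings).
all_labels = tf.concat([labels, labels], axis=0)
all_embeddings = tf.nn.l2_normalize(tf.concat([anchors, positives], axis=0), 1)
triplet = tf.contrib.losses.metric_learning.triplet_semihard_loss(
    labels=all_labels, embeddings=all_embeddings, margin=0.2)
```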
## Inference
We support 3 modes of inference for trained TCN models:
* Mode 1: Input is a tf.Estimator input_fn (see
[this](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator#predict)
for details). Output is an iterator over embeddings and additional metadata.
See `labeled_eval.py` for a usage example.
*   Mode 2: Input is a TFRecord (or a list of TFRecords). This returns an
iterator over tuples of (embeddings, raw_image_strings, sequence_name),
where embeddings is the [num views, sequence length, embedding size] numpy
array holding the full embedded sequence (for all views), raw_image_strings
is a [num views, sequence length] string array holding the jpeg-encoded raw
image strings, and sequence_name is the name of the sequence. See
`generate_videos.py` for a usage example.
* Mode 3: Input is a numpy array of size [num images, height, width, num
channels]. This returns a tuple of (embeddings, raw_image_strings), where
embeddings is a 2-D float32 numpy array holding [num_images, embedding_size]
image embeddings, and raw_image_strings is a 1-D string numpy array holding
[batch_size] jpeg-encoded image strings. This can be used as follows:
```python
images = np.random.uniform(0, 1, size=(batch_size, 1080, 1920, 3))
embeddings, _ = estimator.inference(
images, checkpoint_path=checkpoint_path)
```
See `estimators/base_estimator.py` for details.
## Configuration
Data pipelines, training, eval, and visualization are all configured using
key-value parameters passed as [YAML](https://en.wikipedia.org/wiki/YAML) files.
Configurations can be nested, e.g.:
```yaml
learning:
optimizer: 'adam'
learning_rate: 0.001
```
### T objects
YAML configs are converted to LuaTable-like `T` objects (see
`utils/luatables.py`), which behave like a Python `dict` but allow you to use
dot notation to access (nested) keys. For example, we could access the learning
rate in the above config snippet via `config.learning.learning_rate`.
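The snippet below is a minimal sketch of the idea, assuming nothing about
`utils/luatables.py` beyond what is described here:
```python
import yaml

class T(dict):
  """dict with attribute-style access, applied recursively to nested dicts."""

  def __getattr__(self, key):
    value = self[key]
    return T(value) if isinstance(value, dict) else value

config = T(yaml.safe_load("""
learning:
  optimizer: 'adam'
  learning_rate: 0.001
"""))
print(config.learning.learning_rate)  # 0.001
```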
### Multiple Configs
Multiple configs can be passed to the various binaries as a comma separated list
of config paths via the `--config_paths` flag. This allows us to specify a
default config that applies to all experiments (e.g. how often to write
checkpoints, default embedder hyperparams) and one config per experiment holding
just the hyperparams specific to the experiment (path to data, etc.).
See `configs/tcn_default.yml` for an example of our default config and
`configs/pouring.yml` for an example of how we define the pouring experiments.
Configs are applied left to right. For example, consider two config files:
default.yml
```yaml
learning:
learning_rate: 0.001 # Default learning rate.
optimizer: 'adam'
```
myexperiment.yml
```yaml
learning:
learning_rate: 1.0 # Experiment learning rate (overwrites default).
data:
training: '/path/to/myexperiment/training.tfrecord'
```
Running
```bash
bazel run :train -- --config_paths='default.yml,myexperiment.yml'
```
results in a final merged config called final_training_config.yml
```yaml
learning:
optimizer: 'adam'
learning_rate: 1.0
data:
training: '/path/to/myexperiment/training.tfrecord'
```
which is created automatically and stored in the experiment log directory
alongside model checkpoints and tensorboard summaries. This gives us a record of
the exact configs that went into each trial.
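The merge semantics can be illustrated with a short sketch (a simplified
stand-in for illustration, not the project's actual config-loading code):
```python
import yaml

def merge(base, override):
  """Recursively merges `override` into `base`, overriding scalar values."""
  for key, value in override.items():
    if isinstance(value, dict) and isinstance(base.get(key), dict):
      merge(base[key], value)
    else:
      base[key] = value
  return base

final = {}
for path in ['default.yml', 'myexperiment.yml']:  # applied left to right
  with open(path) as f:
    merge(final, yaml.safe_load(f))
# final['learning']['learning_rate'] == 1.0, taken from myexperiment.yml.
```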
## Monitoring training
We usually look at two validation metrics during training: knn classification
error and multi-view alignment.
### KNN-Classification Error
In cases where we have labeled validation data, we can compute the average
cross-sequence KNN classification error (1.0 - recall@k=1) over all embedded
labeled images in the validation set. See `labeled_eval.py`.
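A hedged numpy sketch of this metric (variable names are illustrative; the real
computation lives in `labeled_eval.py`):
```python
import numpy as np

def knn_classification_error(embeddings, labels, sequence_ids):
  """embeddings: [N, D] floats; labels, sequence_ids: [N] arrays."""
  errors = 0
  for i in range(len(embeddings)):
    # Only consider neighbors from *other* sequences (cross-sequence KNN).
    mask = sequence_ids != sequence_ids[i]
    dists = np.linalg.norm(embeddings[mask] - embeddings[i], axis=1)
    nearest = np.argmin(dists)
    errors += int(labels[mask][nearest] != labels[i])
  return errors / float(len(embeddings))  # 1.0 - recall@k=1
```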
### Multi-view Alignment
In cases where there is no labeled validation data, we can look at how well
our model aligns multiple views of the same embedded validation sequences. That is,
for each embedded validation sequence, for all cross-view pairs, we compute the
scaled absolute distance between ground truth time indices and knn time indices.
See `alignment.py`.
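For one cross-view pair, the metric reduces to the following numpy sketch (see
`compute_average_alignment` in `alignment.py` for the real version, which
averages over all sequences and view pairs):
```python
import numpy as np

def view_pair_alignment(emb_i, emb_j):
  """emb_i, emb_j: [seq_len, embedding_size] embeddings of two views."""
  seq_len = len(emb_i)
  # Nearest-neighbor time index in view j for each frame of view i.
  knn_times = np.array(
      [np.argmin(np.linalg.norm(emb_j - q, axis=1)) for q in emb_i])
  # Scaled absolute distance between true and knn time indices.
  return np.mean(np.abs(np.arange(seq_len) - knn_times) / float(seq_len))
```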
## Visualization
We visualize the embedding space learned by our models in two ways: nearest
neighbor imitation videos and PCA/T-SNE.
### Nearest Neighbor Imitation Videos
One of the easiest ways to evaluate your model's understanding is to see how
well it can semantically align two videos via nearest neighbors in
embedding space.
Consider the case where we have multiple validation demo videos of a human or
robot performing the same task. For example, in the pouring experiments, we
collected many different multiview validation videos of a person pouring the
contents of one container into another, then setting the container down. If we'd
like to see how well our embeddings generalize across viewpoint, object/agent
appearance, and background, we can construct what we call "Nearest Neighbor
Imitation" videos, by embedding some validation query sequence `i` from view 1,
and finding the nearest neighbor for each query frame in some embedded target
sequence `j` filmed from view 1.
[Here's](https://sermanet.github.io/tcn/docs/figs/pouring_human.mov.gif) an
example of the final product.
See `generate_videos.py` for details.
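The core frame pairing is just a nearest-neighbor lookup in embedding space; a
hedged sketch (the video rendering itself is omitted):
```python
import numpy as np

def nearest_neighbor_frames(query_emb, target_emb):
  """For each query frame, returns the index of its nearest target frame.

  query_emb: [num_query_frames, embedding_size] embedded query sequence.
  target_emb: [num_target_frames, embedding_size] embedded target sequence.
  """
  dists = np.linalg.norm(
      query_emb[:, None, :] - target_emb[None, :, :], axis=-1)
  return np.argmin(dists, axis=1)
```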
### PCA & T-SNE Visualization
We can also embed a set of images taken randomly from validation videos and
visualize the embedding space using PCA projection and T-SNE in the tensorboard
projector. See `visualize_embeddings.py` for details.
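A hedged TF 1.x sketch of exporting a set of embeddings for the projector (the
real `visualize_embeddings.py` additionally writes sprite images and metadata;
the paths and random array below are stand-ins):
```python
import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

outdir = '/tmp/tcn/embedding_viz'  # stand-in output directory
tf.gfile.MakeDirs(outdir)
embeddings = np.random.rand(1000, 32).astype(np.float32)  # stand-in embeddings

# Save the embeddings as a checkpointed variable the projector can read.
emb_var = tf.Variable(embeddings, name='tcn_embeddings')
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  tf.train.Saver([emb_var]).save(sess, os.path.join(outdir, 'embeddings.ckpt'))

config = projector.ProjectorConfig()
config.embeddings.add().tensor_name = emb_var.name
projector.visualize_embeddings(tf.summary.FileWriter(outdir), config)
```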
## Tutorial Part I: Collecting Multi-View Webcam Videos
Here we give an end-to-end example of how to collect your own multiview webcam
videos and convert them to the TFRecord format expected by training.
Note: This was tested with up to 8 concurrent [Logitech c930e
webcams](https://www.logitech.com/en-us/product/c930e-webcam) extended with
[Plugable 5 Meter (16 Foot) USB 2.0 Active Repeater Extension
Cables](https://www.amazon.com/gp/product/B006LFL4X0/ref=oh_aui_detailpage_o05_s00?ie=UTF8&psc=1).
### Collect webcam videos
Go to dataset/webcam.py
1. Plug your webcams in and run
```bash
ls -ltrh /dev/video*
```
You should see one device listed per connected webcam.
2. Define some environment variables describing the dataset you're collecting.
```bash
dataset=tutorial # Name of the dataset.
mode=train # E.g. 'train', 'validation', 'test', 'demo'.
num_views=2 # Number of webcams.
viddir=/tmp/tcn/videos # Output directory for the videos.
tmp_imagedir=/tmp/tcn/tmp_images # Temp directory to hold images.
debug_vids=1 # Whether or not to generate side-by-side debug videos.
export DISPLAY=:0.0 # This allows real time matplotlib display.
```
3. Run the webcam.py script.
```bash
bazel build -c opt --copt=-mavx webcam && \
bazel-bin/webcam \
--dataset $dataset \
--mode $mode \
--num_views $num_views \
--tmp_imagedir $tmp_imagedir \
--viddir $viddir \
--debug_vids 1
```
4. Hit Ctrl-C when done collecting, upon which the script will compile videos
for each view and optionally a debug video concatenating multiple
simultaneous views.
5. If the `--seqname` flag isn't set, the script will name the first sequence '0',
the second sequence '1', and so on (meaning you can just keep rerunning step
3.). When you are finished, you should see an output viddir with the
following structure:
```bash
videos/0_view0.mov
videos/0_view1.mov
...
videos/0_viewM.mov
videos/1_viewM.mov
...
videos/N_viewM.mov
for N sequences and M webcam views.
```
### Create TFRecords
Use `dataset/videos_to_tfrecords.py` to convert the directory of videos into a
directory of TFRecords files, one per multi-view sequence.
```bash
viddir=/tmp/tcn/videos
dataset=tutorial
mode=train
videos=$viddir/$dataset
bazel build -c opt videos_to_tfrecords && \
bazel-bin/videos_to_tfrecords --logtostderr \
--input_dir $videos/$mode \
--output_dir ~/tcn_data/$dataset/$mode \
--max_per_shard 400
```
Setting `--max_per_shard` > 0 allows you to shard training data. We've observed
that sharding long training sequences provides better performance in terms of
global steps/sec.
This flag should be left at its default (no sharding) for validation / test data.
You should now have a directory of TFRecords files with the following structure:
```bash
output_dir/0.tfrecord
...
output_dir/N.tfrecord
1 TFRecord file for each of N multi-view sequences.
```
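If you want to sanity-check the output, here is a short sketch that iterates
over one record file and prints each sequence's name and length (the `task` and
`len` context keys are the ones written by `videos_to_tfrecords.py`; the path
is a stand-in):
```python
from __future__ import print_function
import os
import tensorflow as tf

record = os.path.expanduser('~/tcn_data/tutorial/train/0.tfrecord')
for serialized in tf.python_io.tf_record_iterator(record):
  example = tf.train.SequenceExample()
  example.ParseFromString(serialized)
  name = example.context.feature['task'].bytes_list.value[0]
  length = example.context.feature['len'].int64_list.value[0]
  print(name, length)
```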
Now we're ready to move on to part II: training, evaluation, and visualization.
## Tutorial Part II: Training, Evaluation, and Visualization
Here we give an end-to-end example of how to train, evaluate, and visualize the
embedding space learned by TCN models.
### Download Data
We will be using the 'Multiview Pouring' dataset, which can be downloaded using
the download.sh script
[here](https://sites.google.com/site/brainrobotdata/home/multiview-pouring).
The rest of the tutorial will assume that you have your data downloaded to a
folder at `~/tcn_data`.
```bash
mkdir ~/tcn_data
mv ~/Downloads/download.sh ~/tcn_data
cd ~/tcn_data
./download.sh
```
You should now have the following path containing all the data:
```bash
ls ~/tcn_data/multiview-pouring
labels README.txt tfrecords videos
```
### Download Pretrained Inception Checkpoint
If you haven't already, run the script that downloads the pretrained InceptionV3
checkpoint:
```bash
python download_pretrained.py
```
### Define A Config
For our experiment, we create 2 configs:
* `configs/tcn_default.yml`: This contains all the default hyperparameters
that generally don't vary across experiments.
* `configs/pouring.yml`: This contains all the hyperparameters that are
specific to the pouring experiment.
Important note about `configs/pouring.yml`:
* data.eval_cropping: We use 'pad200' for the pouring dataset, which was
filmed rather close up on iPhone cameras. A better choice for data filmed on
webcam is likely 'crop_center'. See preprocessing.py for options.
### Train
Run the training binary:
```bash
logdir=/tmp/tcn/pouring
c=configs
configs=$c/tcn_default.yml,$c/pouring.yml
bazel build -c opt --copt=-mavx --config=cuda train && \
bazel-bin/train \
--config_paths $configs --logdir $logdir
```
### Evaluate
Run the binary that computes running validation loss. Set `export
CUDA_VISIBLE_DEVICES=` to run on CPU.
```bash
bazel build -c opt --copt=-mavx eval && \
bazel-bin/eval \
--config_paths $configs --logdir $logdir
```
Run the binary that computes running validation cross-view sequence alignment.
Set `export CUDA_VISIBLE_DEVICES=` to run on CPU.
```bash
bazel build -c opt --copt=-mavx alignment && \
bazel-bin/alignment \
--config_paths $configs --checkpointdir $logdir --outdir $logdir
```
Run the binary that computes running labeled KNN validation error. Set `export
CUDA_VISIBLE_DEVICES=` to run on CPU.
```bash
bazel build -c opt --copt=-mavx labeled_eval && \
bazel-bin/labeled_eval \
--config_paths $configs --checkpointdir $logdir --outdir $logdir
```
### Monitor training
Run `tensorboard --logdir=$logdir`. After a bit of training, you should see
curves that look like this:
#### Training loss
<img src="g3doc/loss.png" title="Training Loss" />
#### Validation loss
<img src="g3doc/val_loss.png" title="Validation Loss" />
#### Validation Alignment
<img src="g3doc/alignment.png" title="Validation Alignment" />
#### Average Validation KNN Classification Error
<img src="g3doc/avg_error.png" title="Validation Average KNN Error" />
#### Individual Validation KNN Classification Errors
<img src="g3doc/all_error.png" title="All Validation Average KNN Errors" />
### Visualize
To visualize the embedding space learned by a model, we can:
#### Generate Imitation Videos
```bash
# Use the automatically generated final config file as config.
configs=$logdir/final_training_config.yml
# Visualize checkpoint 40001.
checkpoint_iter=40001
# Use validation records for visualization.
records=~/tcn_data/multiview-pouring/tfrecords/val
# Write videos to this location.
outdir=$logdir/tcn_viz/imitation_vids
```
```bash
bazel build -c opt --config=cuda --copt=-mavx generate_videos && \
bazel-bin/generate_videos \
--config_paths $configs \
--checkpointdir $logdir \
--checkpoint_iter $checkpoint_iter \
--query_records_dir $records \
--target_records_dir $records \
--outdir $outdir
```
After the script completes, you should see a directory of videos with names
like:
`$outdir/qtrain_clearodwalla_to_clear1_realv1_imtrain_clearsoda_to_white13_realv0.mp4`
that look like this: <img src="g3doc/im.gif" title="Imitation Video" />
#### T-SNE / PCA Visualization
Run the binary that generates embeddings and metadata.
```bash
outdir=$logdir/tcn_viz/embedding_viz
bazel build -c opt --config=cuda --copt=-mavx visualize_embeddings && \
bazel-bin/visualize_embeddings \
--config_paths $configs \
--checkpointdir $logdir \
--checkpoint_iter $checkpoint_iter \
--embedding_records $records \
--outdir $outdir \
--num_embed 1000 \
--sprite_dim 64
```
Run tensorboard, pointed at the embedding viz output directory.
```
tensorboard --logdir=$outdir
```
You should see something like this in tensorboard.
<img src="g3doc/pca.png" title="PCA" />
workspace(name = "tcn")
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Calculates test sequence alignment score."""
from __future__ import absolute_import
from __future__ import division
import os
import numpy as np
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to a YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string(
'checkpoint_iter', '', 'Evaluate this specific checkpoint.')
tf.app.flags.DEFINE_string(
'checkpointdir', '/tmp/tcn', 'Path to model checkpoints.')
tf.app.flags.DEFINE_string('outdir', '/tmp/tcn', 'Path to write summaries to.')
FLAGS = tf.app.flags.FLAGS
def compute_average_alignment(
seqname_to_embeddings, num_views, summary_writer, training_step):
"""Computes the average cross-view alignment for all sequence view pairs.
Args:
seqname_to_embeddings: Dict, mapping sequence name to a
[num_views, embedding size] numpy matrix holding all embedded views.
num_views: Int, number of simultaneous views in the dataset.
summary_writer: A `SummaryWriter` object.
training_step: Int, the training step of the model used to embed images.
Alignment is the scaled absolute difference between the ground truth time
and the knn aligned time.
abs(|time_i - knn_time|) / sequence_length
"""
all_alignments = []
for _, view_embeddings in seqname_to_embeddings.iteritems():
for idx_i in range(num_views):
for idx_j in range(idx_i+1, num_views):
embeddings_view_i = view_embeddings[idx_i]
embeddings_view_j = view_embeddings[idx_j]
seq_len = len(embeddings_view_i)
times_i = np.array(range(seq_len))
# Get the nearest time_index for each embedding in view_i.
times_j = np.array([util.KNNIdsWithDistances(
q, embeddings_view_j, k=1)[0][0] for q in embeddings_view_i])
# Compute sequence view pair alignment.
alignment = np.mean(
np.abs(np.array(times_i)-np.array(times_j))/float(seq_len))
all_alignments.append(alignment)
print 'alignment so far %f' % alignment
average_alignment = np.mean(all_alignments)
print 'Average alignment %f' % average_alignment
summ = tf.Summary(value=[tf.Summary.Value(
tag='validation/alignment', simple_value=average_alignment)])
summary_writer.add_summary(summ, int(training_step))
def evaluate_once(
config, checkpointdir, validation_records, checkpoint_path, batch_size,
num_views):
"""Evaluates and reports the validation alignment."""
# Choose an estimator based on training strategy.
estimator = get_estimator(config, checkpointdir)
# Embed all validation sequences.
seqname_to_embeddings = {}
for (view_embeddings, _, seqname) in estimator.inference(
validation_records, checkpoint_path, batch_size):
seqname_to_embeddings[seqname] = view_embeddings
# Compute and report alignment statistics.
ckpt_step = int(checkpoint_path.split('-')[-1])
summary_dir = os.path.join(FLAGS.outdir, 'alignment_summaries')
summary_writer = tf.summary.FileWriter(summary_dir)
compute_average_alignment(
seqname_to_embeddings, num_views, summary_writer, ckpt_step)
def main(_):
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
num_views = config.data.num_views
validation_records = util.GetFilesRecursively(config.data.validation)
batch_size = config.data.batch_size
checkpointdir = FLAGS.checkpointdir
# If evaluating a specific checkpoint, do that.
if FLAGS.checkpoint_iter:
checkpoint_path = os.path.join(
'%s/model.ckpt-%s' % (checkpointdir, FLAGS.checkpoint_iter))
evaluate_once(
config, checkpointdir, validation_records, checkpoint_path, batch_size,
num_views)
else:
for checkpoint_path in tf.contrib.training.checkpoints_iterator(
checkpointdir):
evaluate_once(
config, checkpointdir, validation_records, checkpoint_path,
batch_size, num_views)
if __name__ == '__main__':
tf.app.run()
# Train with Multi-View TCN.
training_strategy: 'mvtcn'
# Use the 'inception_conv_ss_fc' embedder, which has the structure:
# InceptionV3 -> 2 conv adaptation layers -> spatial softmax -> fully connected
# -> embedding.
embedder_strategy: 'inception_conv_ss_fc'
# Use npairs loss.
loss_strategy: 'npairs'
learning:
learning_rate: 0.0001
# Set some hyperparameters for our embedder.
inception_conv_ss_fc:
# Don't finetune the pre-trained weights.
finetune_inception: false
dropout:
# Don't dropout convolutional activations.
keep_conv: 1.0
# Use a dropout of 0.8 on the fully connected activations.
keep_fc: 0.8
# Use a dropout of 0.8 on the inception activations.
keep_pretrained: 0.8
# Size of the TCN embedding.
embedding_size: 32
data:
raw_height: 480
raw_width: 360
batch_size: 32
examples_per_sequence: 32
num_views: 2
preprocessing:
# Inference-time image cropping strategy.
eval_cropping: 'pad200'
augmentation:
# Do scale augmentation.
minscale: 0.8 # When downscaling, zoom in to 80% of the central bounding box.
maxscale: 3.0 # When upscaling, zoom out to 300% of the central bounding box.
proportion_scaled_up: 0.5 # Proportion of the time to scale up rather than down.
color: true # Do color augmentation.
fast_mode: true
# Paths to the data.
training: '~/tcn_data/multiview-pouring/tfrecords/train'
validation: '~/tcn_data/multiview-pouring/tfrecords/val'
test: 'path/to/test'
labeled:
image_attr_keys: ['image/view0', 'image/view1', 'task']
label_attr_keys: ['contact', 'distance', 'liquid_flowing', 'has_liquid', 'container_angle']
validation: '~/tcn_data/multiview-pouring/monolithic-labeled/val'
test: '~/tcn_data/multiview-pouring/monolithic-labeled/test'
logging:
checkpoint:
save_checkpoints_steps: 1000
\ No newline at end of file
# These configs are the defaults we used for both the pouring and pose
# experiments.
# Train on TPU?
use_tpu: false # Default is to run without TPU locally.
tpu:
num_shards: 1
iterations: 100
# SGD / general learning hyperparameters.
learning:
max_step: 1000000
learning_rate: 0.001
decay_steps: 10000
decay_factor: 1.00
l2_reg_weight: 0.000001
optimizer: 'adam'
# Default metric learning loss hyperparameters.
triplet_semihard:
embedding_l2: true # Suggestion from Hyun Oh Song's slides.
margin: .2 # Default value for Facenet.
npairs:
embedding_l2: false # Suggestion from Hyun Oh Song's slides.
clustering_loss:
embedding_l2: true # Suggestion from Hyun Oh Song's slides.
margin: 1.0 # Default in deep_metric_learning.
lifted_struct:
embedding_l2: false # Suggestion from Hyun Oh Song's slides.
margin: 1.0
contrastive:
embedding_l2: true # Suggestion from Hyun Oh Song's slides.
margin: 1.0
# Which method to use to train the embedding.
# Options are "mvtcn", "svtcn".
training_strategy: 'mvtcn'
# Which embedder architecture to use.
# Options are 'inception_conv_ss_fc' (used in pouring / pose experiments),
# 'resnet'.
embedder_strategy: 'inception_conv_ss_fc'
# Size of the TCN embedding.
embedding_size: 32
# Default hyperparameters for the different embedder architectures.
inception_conv_ss_fc:
pretrained_checkpoint: 'pretrained_checkpoints/inception/inception_v3.ckpt'
pretrained_layer: 'Mixed_5d'
additional_conv_sizes: [512, 512]
fc_hidden_sizes: [2048]
finetune: false
dropout:
keep_pretrained: 1.0
keep_conv: 1.0
keep_fc: 1.0
resnet:
pretrained_checkpoint: 'pretrained_checkpoints/resnet/resnet_v2_50.ckpt'
pretrained_layer: 4
finetune: false
adaptation_blocks: '512_3-512_3'
emb_connection: 'conv'
fc_hidden_sizes: 'None'
dropout:
keep_pretrained: 1.0
# Loss hyperparameters.
mvtcn:
# Size of the window in timesteps to get random anchor-positive pairs for
# training.
window: 580 # 29fps * 20 seconds.
svtcn:
pos_radius: 6 # 0.2 seconds * 29fps ~ 6 timesteps.
neg_radius: 12 # 2.0 * pos_radius.
# Data configs.
data:
height: 299
width: 299
preprocessing:
# Strategy to use when cropping images at inference time.
# See preprocessing.py for options.
eval_cropping: 'crop_center'
# Training scale, color augmentation hyparameters.
augmentation:
# See preprocessing.py for a discussion of how to use these parameters.
minscale: 1.0
maxscale: 1.0
proportion_scaled_up: 0.5
color: true
fast_mode: true
num_parallel_calls: 12
sequence_prefetch_size: 12
batch_prefetch_size: 12
batch_size: 36
eval_batch_size: 36
embed_batch_size: 128
val:
recall_at_k_list: [1]
num_eval_samples: 1000
eval_interval_secs: 300
logging:
summary:
image_summaries: false
save_summaries_steps: 100
flush_secs: 600
checkpoint:
num_to_keep: 0 # Keep all checkpoints.
save_checkpoints_steps: 1000
secs: 1800
\ No newline at end of file
use_tpu: False
training_strategy: 'mvtcn'
loss_strategy: 'triplet_semihard'
learning:
max_step: 2
optimizer: 'adam'
embedding_size: 8
data:
embed_batch_size: 12
batch_size: 12
examples_per_sequence: 12
num_views: 2
num_parallel_calls: 1
sequence_prefetch_size: 1
batch_prefetch_size: 1
logging:
summary:
image_summaries: false
save_summaries_steps: 100
flush_secs: 600
save_summaries_secs: 60
checkpoint:
num_to_keep: 0 # Keep all checkpoints.
save_checkpoints_steps: 1000
secs: 1800
\ No newline at end of file
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for data_providers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import data_providers
import tensorflow as tf
class DataTest(tf.test.TestCase):
def testMVTripletIndices(self):
"""Ensures anchor/pos indices for a TCN batch are valid."""
tf.set_random_seed(0)
window = 580
batch_size = 36
num_pairs = batch_size // 2
num_views = 2
seq_len = 600
# Get anchor time and view indices for this sequence.
(_, a_view_indices,
p_view_indices) = data_providers.get_tcn_anchor_pos_indices(
seq_len, num_views, num_pairs, window)
with self.test_session() as sess:
(np_a_view_indices,
np_p_view_indices) = sess.run([a_view_indices, p_view_indices])
# Assert no overlap between anchor and pos view indices.
np.testing.assert_equal(
np.any(np.not_equal(np_a_view_indices, np_p_view_indices)), True)
# Assert set of view indices is a subset of expected set of view indices.
view_set = set(range(num_views))
self.assertTrue(set(np_a_view_indices).issubset(view_set))
self.assertTrue(set(np_p_view_indices).issubset(view_set))
def testSVTripletIndices(self):
"""Ensures time indices for a SV triplet batch are valid."""
seq_len = 600
batch_size = 36
num_views = 2
time_indices, _ = data_providers.get_svtcn_indices(
seq_len, batch_size, num_views)
with self.test_session() as sess:
np_time_indices = sess.run(time_indices)
first = np_time_indices[0]
last = np_time_indices[-1]
# Make sure batch time indices are a contiguous range.
self.assertTrue(np.array_equal(np_time_indices, range(first, last+1)))
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts temp directories of images to videos."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import shutil
# pylint: disable=invalid-name
parser = argparse.ArgumentParser()
parser.add_argument(
'--view_dirs', type=str, default='',
help='Comma-separated list of temp view image directories.')
parser.add_argument(
'--vid_paths', type=str, default='',
help='Comma-separated list of video output paths.')
parser.add_argument(
'--debug_path', type=str, default='',
help='Output path to debug video.')
parser.add_argument(
    '--debug_lhs_view', type=str, default='0',
    help='View index to use for the left-hand side of the debug video.')
parser.add_argument(
    '--debug_rhs_view', type=str, default='1',
    help='View index to use for the right-hand side of the debug video.')
def create_vids(view_dirs, vid_paths, debug_path=None,
debug_lhs_view=0, debug_rhs_view=1):
"""Creates one video per view per sequence."""
# Create the view videos.
for (view_dir, vidpath) in zip(view_dirs, vid_paths):
encode_vid_cmd = r'mencoder mf://%s/*.png \
-mf fps=29:type=png \
-ovc lavc -lavcopts vcodec=mpeg4:mbd=2:trell \
-oac copy -o %s' % (view_dir, vidpath)
os.system(encode_vid_cmd)
# Optionally create a debug side-by-side video.
if debug_path:
lhs = vid_paths[int(debug_lhs_view)]
rhs = vid_paths[int(debug_rhs_view)]
os.system(r"avconv \
-i %s \
-i %s \
-filter_complex '[0:v]pad=iw*2:ih[int];[int][1:v]overlay=W/2:0[vid]' \
-map [vid] \
-c:v libx264 \
-crf 23 \
-preset veryfast \
%s" % (lhs, rhs, debug_path))
def main():
FLAGS, _ = parser.parse_known_args()
assert FLAGS.view_dirs
assert FLAGS.vid_paths
view_dirs = FLAGS.view_dirs.split(',')
vid_paths = FLAGS.vid_paths.split(',')
create_vids(view_dirs, vid_paths, FLAGS.debug_path,
FLAGS.debug_lhs_view, FLAGS.debug_rhs_view)
# Cleanup temp image dirs.
for i in view_dirs:
shutil.rmtree(i)
if __name__ == '__main__':
main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Converts videos to training, validation, test, and debug tfrecords on cns.
Example usage:
# From phone videos.
x=learning/brain/research/tcn/videos_to_tfrecords && \
blaze build -c opt $x && \
set=tmp && videos=~/data/tcn/datasets/$set/ && \
blaze-bin/$x --logtostderr --output_dir /cns/oi-d/home/$USER/tcn_data/$set \
--input_dir $videos/train
--debug $dataset/debug --rotate 90 --max_per_shard 400
# From webcam videos.
mode=train
x=learning/brain/research/tcn/videos_to_tfrecords && \
blaze build -c opt $x && \
set=tmp && videos=/tmp/tcn/videos/$set/ && \
blaze-bin/$x --logtostderr \
--output_dir /cns/oi-d/home/$USER/tcn_data/$set/$mode \
--input_dir $videos/$mode --max_per_shard 400
"""
import glob
import math
import multiprocessing
from multiprocessing.pool import ThreadPool
import os
from random import shuffle
import re
from StringIO import StringIO
import cv2
from PIL import Image
from PIL import ImageFile
from preprocessing import cv2resizeminedge
from preprocessing import cv2rotateimage
from preprocessing import shapestring
from utils.progress import Progress
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.flags.DEFINE_string('view_pattern', '_view[_]*[0]+[.].*',
'view regexp pattern for first view')
tf.app.flags.DEFINE_string('input_dir', '', '''input data path''')
tf.app.flags.DEFINE_integer('resize_min_edge', 0,
'''resize the smallest edge to this size.''')
tf.app.flags.DEFINE_integer('rotate', 0, '''rotate the image in degrees.''')
tf.app.flags.DEFINE_string('rotate_if_matching', None,
'rotate only if video path matches regexp.')
tf.app.flags.DEFINE_string('output_dir', '', 'output directory for the dataset')
tf.app.flags.DEFINE_integer(
'max_per_shard', -1, 'max # of frames per data chunk')
tf.app.flags.DEFINE_integer('expected_views', 2, 'expected number of views')
tf.app.flags.DEFINE_integer('log_frequency', 50, 'frequency of logging')
tf.app.flags.DEFINE_integer(
'max_views_discrepancy', 100,
'Maximum length difference (in frames) allowed between views')
tf.app.flags.DEFINE_boolean('overwrite', False, 'overwrite output files')
FLAGS = tf.app.flags.FLAGS
feature = tf.train.Feature
bytes_feature = lambda v: feature(bytes_list=tf.train.BytesList(value=v))
int64_feature = lambda v: feature(int64_list=tf.train.Int64List(value=v))
float_feature = lambda v: feature(float_list=tf.train.FloatList(value=v))
def FindPatternFiles(path, view_pattern, errors):
"""Recursively find all files matching a certain pattern."""
if not path:
return None
tf.logging.info(
'Recursively searching for files matching pattern \'%s\' in %s' %
(view_pattern, path))
view_patt = re.compile('.*' + view_pattern)
sequences = []
for root, _, filenames in os.walk(path, followlinks=True):
path_root = root[:len(path)]
assert path_root == path
for filename in filenames:
if view_patt.match(filename):
fullpath = os.path.join(root, re.sub(view_pattern, '', filename))
shortpath = re.sub(path, '', fullpath).lstrip('/')
# Determine if this sequence should be sharded or not.
shard = False
if FLAGS.max_per_shard > 0:
shard = True
# Retrieve number of frames for this sequence.
num_views, length, view_paths, num_frames = GetViewInfo(
fullpath + view_pattern[0] + '*')
if num_views != FLAGS.expected_views:
tf.logging.info('Expected %d views but found: %s' %
(FLAGS.expected_views, str(view_paths)))
assert num_views == FLAGS.expected_views
assert length > 0
# Drop sequences if view lengths differ too much.
if max(num_frames) - min(num_frames) > FLAGS.max_views_discrepancy:
error_msg = (
'Error: ignoring sequence with views with length difference > %d:'
'%s in %s') % (FLAGS.max_views_discrepancy, str(num_frames),
fullpath)
errors.append(error_msg)
tf.logging.error(error_msg)
else:
# Append sequence info.
sequences.append({'full': fullpath, 'name': shortpath, 'len': length,
'start': 0, 'end': length, 'num_views': num_views,
'shard': shard})
return sorted(sequences, key=lambda k: k['name'])
def ShardSequences(sequences, max_per_shard):
"""Find all sequences, shard and randomize them."""
total_shards_len = 0
total_shards = 0
assert max_per_shard > 0
for sequence in sequences:
if sequence['shard']:
sequence['shard'] = False # Reset shard flag.
length = sequence['len']
start = sequence['start']
end = sequence['end']
name = sequence['name']
assert end - start == length
if length > max_per_shard:
# Dividing sequence into smaller shards.
num_shards = int(math.floor(length / max_per_shard)) + 1
size = int(math.ceil(length / num_shards))
tf.logging.info(
'splitting sequence of length %d into %d shards of size %d' %
(length, num_shards, size))
last_end = 0
for i in range(num_shards):
shard_start = last_end
shard_end = min(length, shard_start + size)
if i == num_shards - 1:
shard_end = length
shard_len = shard_end - shard_start
total_shards_len += shard_len
shard_name = name + '_shard%02d' % i
last_end = shard_end
# Enqueuing shard.
if i == 0: # Replace current sequence.
sequence['len'] = shard_len
sequence['start'] = shard_start
sequence['end'] = shard_end
sequence['name'] = shard_name
else: # Enqueue new sequence.
sequences.append(
{'full': sequence['full'], 'name': shard_name,
'len': shard_len, 'start': shard_start, 'end': shard_end,
'num_views': sequence['num_views'], 'shard': False})
total_shards += num_shards
assert last_end == length
# Print resulting sharding.
if total_shards > 0:
tf.logging.info('%d shards of average length %d' %
(total_shards, total_shards_len / total_shards))
return sorted(sequences, key=lambda k: k['name'])
def RandomizeSets(sets):
"""Randomize each set."""
for _, sequences in sorted(sets.iteritems()):
if sequences:
# Randomize order.
shuffle(sequences)
def GetSpecificFrame(vid_path, frame_index):
"""Gets a frame at a specified index in a video."""
cap = cv2.VideoCapture(vid_path)
  cap.set(1, frame_index)  # 1 == cv2.CAP_PROP_POS_FRAMES.
_, bgr = cap.read()
cap.release()
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
return rgb
def JpegString(image, jpeg_quality=90):
"""Returns given PIL.Image instance as jpeg string.
Args:
image: A PIL image.
jpeg_quality: The image quality, on a scale from 1 (worst) to 95 (best).
Returns:
a jpeg_string.
"""
# This fix to PIL makes sure that we don't get an error when saving large
# jpeg files. This is a workaround for a bug in PIL. The value should be
# substantially larger than the size of the image being saved.
ImageFile.MAXBLOCK = 640 * 512 * 64
output_jpeg = StringIO()
image.save(output_jpeg, 'jpeg', quality=jpeg_quality, optimize=True)
return output_jpeg.getvalue()
def ParallelPreprocessing(args):
"""Parallel preprocessing: rotation, resize and jpeg encoding to string."""
(vid_path, timestep, num_timesteps, view) = args
try:
image = GetSpecificFrame(vid_path, timestep)
# Resizing.
resize_str = ''
if FLAGS.resize_min_edge > 0:
resize_str += ', resize ' + shapestring(image)
image = cv2resizeminedge(image, FLAGS.resize_min_edge)
resize_str += ' => ' + shapestring(image)
# Rotating.
rotate = None
if FLAGS.rotate:
rotate = FLAGS.rotate
if FLAGS.rotate_if_matching is not None:
rotate = None
patt = re.compile(FLAGS.rotate_if_matching)
if patt.match(vid_path) is not None:
rotate = FLAGS.rotate
if rotate is not None:
image = cv2rotateimage(image, FLAGS.rotate)
# Jpeg encoding.
image = Image.fromarray(image)
im_string = bytes_feature([JpegString(image)])
if timestep % FLAGS.log_frequency == 0:
tf.logging.info('Loaded frame %d / %d for %s (rotation %s%s) from %s' %
(timestep, num_timesteps, view, str(rotate), resize_str,
vid_path))
return im_string
except cv2.error as e:
tf.logging.error('Error while loading frame %d of %s: %s' %
(timestep, vid_path, str(e)))
return None
def GetNumFrames(vid_path):
"""Gets the number of frames in a video."""
cap = cv2.VideoCapture(vid_path)
  total_frames = cap.get(7)  # 7 == cv2.CAP_PROP_FRAME_COUNT.
cap.release()
return int(total_frames)
def GetViewInfo(views_fullname):
"""Return information about a group of views."""
view_paths = sorted(glob.glob(views_fullname))
num_frames = [GetNumFrames(i) for i in view_paths]
min_num_frames = min(num_frames)
num_views = len(view_paths)
return num_views, min_num_frames, view_paths, num_frames
def AddSequence(sequence, writer, progress, errors):
"""Converts a sequence to a SequenceExample.
Sequences have multiple viewpoint videos. Extract all frames from all
viewpoint videos in parallel, build a single SequenceExample containing
all viewpoint images for every timestep.
Args:
sequence: a dict with information on a sequence.
writer: A TFRecordWriter.
progress: A Progress object to report processing progress.
errors: a list of string to append to in case of errors.
"""
fullname = sequence['full']
shortname = sequence['name']
start = sequence['start']
end = sequence['end']
num_timesteps = sequence['len']
# Build a list of all view paths for this fullname.
path = fullname + FLAGS.view_pattern[0] + '*'
tf.logging.info('Loading sequence from ' + path)
view_paths = sorted(glob.glob(path))
# Extract all images for all views
num_frames = [GetNumFrames(i) for i in view_paths]
tf.logging.info('Loading %s with [%d, %d[ (%d frames) from: %s %s' %
(shortname, start, end, num_timesteps,
str(num_frames), str(view_paths)))
num_views = len(view_paths)
total_timesteps = int(min(num_frames))
assert num_views == FLAGS.expected_views
assert num_views == sequence['num_views']
# Create a worker pool to parallelize loading/rotating
worker_pool = ThreadPool(multiprocessing.cpu_count())
# Collect all images for each view.
view_to_feature_list = {}
view_images = []
for view_idx, view in enumerate(
['view'+str(i) for i in range(num_views)]):
# Flatten list to process in parallel
work = []
for i in range(start, end):
work.append((view_paths[view_idx], i, total_timesteps, view))
# Load and rotate images in parallel
view_images.append(worker_pool.map(ParallelPreprocessing, work))
# Report progress.
progress.Add(len(view_images[view_idx]))
tf.logging.info('%s' % str(progress))
# Remove error frames from all views
i = start
num_errors = 0
while i < len(view_images[0]):
remove_frame = False
# Check if one or more views have an error for this frame.
for view_idx in range(num_views):
if view_images[view_idx][i] is None:
remove_frame = True
error_msg = 'Removing frame %d for all views for %s ' % (i, fullname)
errors.append(error_msg)
tf.logging.error(error_msg)
# Remove faulty frames.
if remove_frame:
num_errors += 1
for view_idx in range(num_views):
del view_images[view_idx][i]
else:
i += 1
# Ignore sequences that have errors.
if num_errors > 0:
error_msg = 'Dropping sequence because of frame errors for %s' % fullname
errors.append(error_msg)
tf.logging.error(error_msg)
else:
# Build FeatureList objects for each view.
for view_idx, view in enumerate(
['view'+str(i) for i in range(num_views)]):
# Construct FeatureList from repeated feature.
view_to_feature_list[view] = tf.train.FeatureList(
feature=view_images[view_idx])
context_features = tf.train.Features(feature={
'task': bytes_feature([shortname]),
'len': int64_feature([num_timesteps])
})
feature_lists = tf.train.FeatureLists(feature_list=view_to_feature_list)
ex = tf.train.SequenceExample(
context=context_features, feature_lists=feature_lists)
writer.write(ex.SerializeToString())
tf.logging.info('Done adding %s with %d timesteps'
% (fullname, num_timesteps))
def PrintSequencesInfo(sequences, prefix):
"""Print information about sequences and return the total number of frames."""
tf.logging.info('')
tf.logging.info(prefix)
num_frames = 0
for sequence in sequences:
shard_str = ''
if sequence['shard']:
shard_str = ' (sharding)'
tf.logging.info('frames [%d, %d[\t(%d frames * %d views)%s\t%s' % (
sequence['start'], sequence['end'], sequence['len'],
sequence['num_views'], shard_str, sequence['name']))
num_frames += sequence['len'] * sequence['num_views']
tf.logging.info(('%d frames (all views), %d sequences, average sequence'
' length (all views): %d') %
(num_frames, len(sequences), num_frames / len(sequences)))
tf.logging.info('')
return num_frames
def CheckRecord(filename, sequence):
"""Check that an existing tfrecord corresponds to the expected sequence."""
num_sequences = 0
total_frames = 0
for serialized_example in tf.python_io.tf_record_iterator(filename):
num_sequences += 1
example = tf.train.SequenceExample()
example.ParseFromString(serialized_example)
length = example.context.feature['len'].int64_list.value[0]
name = example.context.feature['task'].bytes_list.value[0]
total_frames += len(example.feature_lists.feature_list) * length
if sequence['name'] != name or sequence['len'] != length:
return False, total_frames
if num_sequences == 0:
return False, total_frames
return True, total_frames
def AddSequences():
"""Creates one training, validation."""
errors = []
# Generate datasets file lists.
sequences = FindPatternFiles(FLAGS.input_dir, FLAGS.view_pattern, errors)
num_frames = PrintSequencesInfo(sequences,
'Found the following datasets and files:')
# Sharding and randomizing sets.
if FLAGS.max_per_shard > 0:
sequences = ShardSequences(sequences, FLAGS.max_per_shard)
num_frames = PrintSequencesInfo(sequences, 'After sharding:')
tf.logging.info('')
# Process sets.
progress = Progress(num_frames)
output_list = []
for sequence in sequences:
record_name = os.path.join(
FLAGS.output_dir, '%s.tfrecord' % sequence['name'])
if tf.gfile.Exists(record_name) and not FLAGS.overwrite:
ok, num_frames = CheckRecord(record_name, sequence)
if ok:
progress.Add(num_frames)
tf.logging.info('Skipping existing output file: %s' % record_name)
continue
else:
tf.logging.info('File does not match sequence, reprocessing...')
output_dir = os.path.dirname(record_name)
if not tf.gfile.Exists(output_dir):
tf.logging.info('Creating output directory: %s' % output_dir)
tf.gfile.MakeDirs(output_dir)
output_list.append(record_name)
tf.logging.info('Writing to ' + record_name)
writer = tf.python_io.TFRecordWriter(record_name)
AddSequence(sequence, writer, progress, errors)
writer.close()
tf.logging.info('Wrote dataset files: ' + str(output_list))
tf.logging.info('All errors (%d): %s' % (len(errors), str(errors)))
def main(_):
AddSequences()
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Collect images from multiple simultaneous webcams.
Usage:
1. Define some environment variables that describe what you're collecting.
dataset=your_dataset_name
mode=train
num_views=2
viddir=/tmp/tcn/videos
tmp_imagedir=/tmp/tcn/tmp_images
debug_vids=1
2. Run the script.
export DISPLAY=:0.0 && \
bazel build -c opt --copt=-mavx webcam && \
bazel-bin/webcam \
--dataset $dataset \
--mode $mode \
--num_views $num_views \
--tmp_imagedir $tmp_imagedir \
--viddir $viddir \
--debug_vids 1 \
--logtostderr
3. Hit Ctrl-C when done collecting, upon which the script will compile videos
for each view and optionally a debug video concatenating multiple
simultaneous views.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
from multiprocessing import Process
import os
import subprocess
import sys
import time
import cv2
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import animation # pylint: disable=g-import-not-at-top
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string('dataset', '', 'Name of the dataset we`re collecting.')
tf.flags.DEFINE_string('mode', '',
'What type of data we`re collecting. E.g.:'
'`train`,`valid`,`test`, or `demo`')
tf.flags.DEFINE_string('seqname', '',
                       'Name of this sequence. If empty, the script will use '
                       'the name seq_N+1 where seq_N is the latest '
'integer-named sequence in the videos directory.')
tf.flags.DEFINE_integer('num_views', 2,
'Number of webcams.')
tf.flags.DEFINE_string('tmp_imagedir', '/tmp/tcn/data',
'Temporary outdir to write images.')
tf.flags.DEFINE_string('viddir', '/tmp/tcn/videos',
'Base directory to write debug videos.')
tf.flags.DEFINE_boolean('debug_vids', True,
                        'Whether to generate debug vids with multiple '
'concatenated views.')
tf.flags.DEFINE_string('debug_lhs_view', '0',
'Which viewpoint to use for the lhs video.')
tf.flags.DEFINE_string('debug_rhs_view', '1',
'Which viewpoint to use for the rhs video.')
tf.flags.DEFINE_integer('height', 1080, 'Raw input height.')
tf.flags.DEFINE_integer('width', 1920, 'Raw input width.')
tf.flags.DEFINE_string('webcam_ports', None,
'Comma-separated list of each webcam usb port.')
FLAGS = tf.app.flags.FLAGS
class ImageQueue(object):
"""An image queue holding each stream's most recent image.
Basically implements a process-safe collections.deque(maxlen=1).
"""
def __init__(self):
self.lock = multiprocessing.Lock()
self._queue = multiprocessing.Queue(maxsize=1)
def append(self, data):
with self.lock:
if self._queue.full():
# Pop the first element.
_ = self._queue.get()
self._queue.put(data)
def get(self):
with self.lock:
return self._queue.get()
def empty(self):
return self._queue.empty()
def close(self):
return self._queue.close()
class WebcamViewer(object):
"""A class which displays a live stream from the webcams."""
def __init__(self, display_queues):
"""Create a WebcamViewer instance."""
self.height = FLAGS.height
self.width = FLAGS.width
self.queues = display_queues
def _get_next_images(self):
"""Gets the next image to display."""
# Wait for one image per view.
not_found = True
while not_found:
if True in [q.empty() for q in self.queues]:
# At least one image queue is empty; wait.
continue
else:
# Retrieve the images.
latest = [q.get() for q in self.queues]
combined = np.concatenate(latest, axis=1)
not_found = False
return combined
def run(self):
"""Displays the Kcam live stream in a window.
This function blocks until the window is closed.
"""
fig, rgb_axis = plt.subplots()
image_rows = self.height
image_cols = self.width * FLAGS.num_views
initial_image = np.zeros((image_rows, image_cols, 3))
rgb_image = rgb_axis.imshow(initial_image, interpolation='nearest')
def update_figure(frame_index):
"""Animation function for matplotlib FuncAnimation. Updates the image.
Args:
frame_index: The frame number.
Returns:
An iterable of matplotlib drawables to clear.
"""
_ = frame_index
images = self._get_next_images()
images = images[..., [2, 1, 0]]
rgb_image.set_array(images)
return rgb_image,
# We must keep a reference to this animation in order for it to work.
unused_animation = animation.FuncAnimation(
fig, update_figure, interval=50, blit=True)
mng = plt.get_current_fig_manager()
mng.resize(*mng.window.maxsize())
plt.show()
def reconcile(queues, write_queue):
"""Gets a list of concurrent images from each view queue.
This waits for latest images to be available in all view queues,
then continuously:
- Creates a list of current images for each view.
- Writes the list to a queue of image lists to write to disk.
Args:
queues: A list of `ImageQueues`, holding the latest image from each webcam.
write_queue: A multiprocessing.Queue holding lists of concurrent images.
"""
# Loop forever.
while True:
# Wait till all queues have an image.
if True in [q.empty() for q in queues]:
continue
else:
# Retrieve all views' images.
latest = [q.get() for q in queues]
# Copy the list of all concurrent images to the write queue.
write_queue.put(latest)
def persist(write_queue, view_dirs):
"""Pulls lists of concurrent images off a write queue, writes them to disk.
Args:
write_queue: A multiprocessing.Queue holding lists of concurrent images;
one image per view.
view_dirs: A list of strings, holding the output image directories for each
view.
"""
timestep = 0
while True:
# Wait till there is work in the queue.
if write_queue.empty():
continue
# Get a list of concurrent images to write to disk.
view_ims = write_queue.get()
for view_idx, image in enumerate(view_ims):
view_base = view_dirs[view_idx]
# Assign all concurrent view images the same sequence timestep.
fname = os.path.join(view_base, '%s.png' % str(timestep).zfill(10))
cv2.imwrite(fname, image)
# Move to the next timestep.
timestep += 1
def get_image(camera):
"""Captures a single image from the camera and returns it in PIL format."""
data = camera.read()
_, im = data
return im
def capture_webcam(camera, display_queue, reconcile_queue):
"""Captures images from simultaneous webcams, writes them to queues.
Args:
camera: A cv2.VideoCapture object representing an open webcam stream.
display_queue: An ImageQueue.
reconcile_queue: An ImageQueue.
"""
# Take some ramp images to allow cams to adjust for brightness etc.
for i in range(60):
tf.logging.info('Taking ramp image %d.' % i)
get_image(camera)
cnt = 0
start = time.time()
while True:
    # Get the latest image from this camera.
im = get_image(camera)
# Replace the current image in the display and reconcile queues.
display_queue.append(im)
reconcile_queue.append(im)
cnt += 1
current = time.time()
if cnt % 100 == 0:
tf.logging.info('Collected %s of video, %d frames at ~%.2f fps.' % (
timer(start, current), cnt, cnt/(current-start)))
def timer(start, end):
"""Returns a formatted time elapsed."""
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
return '{:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds)
def display_webcams(display_queues):
"""Builds an WebcamViewer to animate incoming images, runs it."""
viewer = WebcamViewer(display_queues)
viewer.run()
def create_vids(view_dirs, seqname):
"""Creates one video per view per sequence."""
vidbase = os.path.join(FLAGS.viddir, FLAGS.dataset, FLAGS.mode)
if not os.path.exists(vidbase):
os.makedirs(vidbase)
vidpaths = []
for idx, view_dir in enumerate(view_dirs):
vidname = os.path.join(vidbase, '%s_view%d.mp4' % (seqname, idx))
encode_vid_cmd = r'mencoder mf://%s/*.png \
-mf fps=29:type=png \
-ovc lavc -lavcopts vcodec=mpeg4:mbd=2:trell \
-oac copy -o %s' % (view_dir, vidname)
os.system(encode_vid_cmd)
vidpaths.append(vidname)
debugpath = None
if FLAGS.debug_vids:
lhs = vidpaths[FLAGS.debug_lhs_view]
rhs = vidpaths[FLAGS.debug_rhs_view]
debug_base = os.path.join('%s_debug' % FLAGS.viddir, FLAGS.dataset,
FLAGS.mode)
if not os.path.exists(debug_base):
os.makedirs(debug_base)
debugpath = '%s/%s.mp4' % (debug_base, seqname)
os.system(r"avconv \
-i %s \
-i %s \
-filter_complex '[0:v]pad=iw*2:ih[int];[int][1:v]overlay=W/2:0[vid]' \
-map [vid] \
-c:v libx264 \
-crf 23 \
-preset veryfast \
%s" % (lhs, rhs, debugpath))
return vidpaths, debugpath
def setup_paths():
"""Sets up the necessary paths to collect videos."""
assert FLAGS.dataset
assert FLAGS.mode
assert FLAGS.num_views
# Setup directory for final images used to create videos for this sequence.
tmp_imagedir = os.path.join(FLAGS.tmp_imagedir, FLAGS.dataset, FLAGS.mode)
if not os.path.exists(tmp_imagedir):
os.makedirs(tmp_imagedir)
# Create a base directory to hold all sequence videos if it doesn't exist.
vidbase = os.path.join(FLAGS.viddir, FLAGS.dataset, FLAGS.mode)
if not os.path.exists(vidbase):
os.makedirs(vidbase)
# Get one directory per concurrent view and a sequence name.
view_dirs, seqname = get_view_dirs(vidbase, tmp_imagedir)
# Get an output path to each view's video.
vid_paths = []
for idx, _ in enumerate(view_dirs):
vid_path = os.path.join(vidbase, '%s_view%d.mp4' % (seqname, idx))
vid_paths.append(vid_path)
# Optionally build paths to debug_videos.
debug_path = None
if FLAGS.debug_vids:
debug_base = os.path.join('%s_debug' % FLAGS.viddir, FLAGS.dataset,
FLAGS.mode)
if not os.path.exists(debug_base):
os.makedirs(debug_base)
debug_path = '%s/%s.mp4' % (debug_base, seqname)
return view_dirs, vid_paths, debug_path
def get_view_dirs(vidbase, tmp_imagedir):
"""Creates and returns one view directory per webcam."""
# Create and append a sequence name.
if FLAGS.seqname:
seqname = FLAGS.seqname
else:
    # If the video directory is empty, this is the first sequence.
if not os.listdir(vidbase):
seqname = '0'
else:
# Otherwise, get the latest sequence name and increment it.
seq_names = [i.split('_')[0] for i in os.listdir(vidbase)]
latest_seq = sorted(map(int, seq_names), reverse=True)[0]
seqname = str(latest_seq+1)
tf.logging.info('No seqname specified, using: %s' % seqname)
view_dirs = [os.path.join(
tmp_imagedir, '%s_view%d' % (seqname, v)) for v in range(FLAGS.num_views)]
for d in view_dirs:
if not os.path.exists(d):
os.makedirs(d)
return view_dirs, seqname
def get_cameras():
"""Opens cameras using cv2, ensures they can take images."""
# Try to get free webcam ports.
if FLAGS.webcam_ports:
ports = map(int, FLAGS.webcam_ports.split(','))
else:
ports = range(FLAGS.num_views)
cameras = [cv2.VideoCapture(i) for i in ports]
if not all([i.isOpened() for i in cameras]):
try:
# Try to find and kill hanging cv2 process_ids.
output = subprocess.check_output(['lsof -t /dev/video*'], shell=True)
tf.logging.info('Found hanging cv2 process_ids: \n')
tf.logging.info(output)
tf.logging.info('Killing hanging processes...')
for process_id in output.split('\n')[:-1]:
subprocess.call(['kill %s' % process_id], shell=True)
time.sleep(3)
# Recapture webcams.
cameras = [cv2.VideoCapture(i) for i in ports]
except subprocess.CalledProcessError:
raise ValueError(
'Cannot connect to cameras. Try running: \n'
'ls -ltrh /dev/video* \n '
'to see which ports your webcams are connected to. Then hand those '
'ports as a comma-separated list to --webcam_ports, e.g. '
'--webcam_ports 0,1')
# Verify each camera is able to capture images.
ims = map(get_image, cameras)
  assert all(im is not None for im in ims)
return cameras
def launch_images_to_videos(view_dirs, vid_paths, debug_path):
"""Launch job in separate process to convert images to videos."""
f = 'learning/brain/research/tcn/dataset/images_to_videos.py'
cmd = ['python %s ' % f]
cmd += ['--view_dirs %s ' % ','.join(i for i in view_dirs)]
cmd += ['--vid_paths %s ' % ','.join(i for i in vid_paths)]
cmd += ['--debug_path %s ' % debug_path]
cmd += ['--debug_lhs_view %s ' % FLAGS.debug_lhs_view]
cmd += ['--debug_rhs_view %s ' % FLAGS.debug_rhs_view]
cmd += [' & ']
cmd = ''.join(i for i in cmd)
# Call images_to_videos asynchronously.
fnull = open(os.devnull, 'w')
subprocess.Popen([cmd], stdout=fnull, stderr=subprocess.STDOUT, shell=True)
for p in vid_paths:
tf.logging.info('Writing final video to: %s' % p)
if debug_path:
tf.logging.info('Writing debug video to: %s' % debug_path)
def main(_):
# Initialize the camera capture objects.
cameras = get_cameras()
# Get one output directory per view.
view_dirs, vid_paths, debug_path = setup_paths()
try:
# Wait for user input.
try:
tf.logging.info('About to write to:')
for v in view_dirs:
tf.logging.info(v)
raw_input('Press Enter to continue...')
except SyntaxError:
pass
# Create a queue per view for displaying and saving images.
display_queues = [ImageQueue() for _ in range(FLAGS.num_views)]
reconcile_queues = [ImageQueue() for _ in range(FLAGS.num_views)]
# Create a queue for collecting all tuples of multi-view images to write to
# disk.
write_queue = multiprocessing.Queue()
processes = []
# Create a process to display collected images in real time.
processes.append(Process(target=display_webcams, args=(display_queues,)))
# Create a process to collect the latest simultaneous images from each view.
processes.append(Process(
target=reconcile, args=(reconcile_queues, write_queue,)))
    # Create a process to write collected multi-view image tuples to disk.
processes.append(Process(
target=persist, args=(write_queue, view_dirs,)))
for (cam, dq, rq) in zip(cameras, display_queues, reconcile_queues):
processes.append(Process(
target=capture_webcam, args=(cam, dq, rq,)))
for p in processes:
p.start()
for p in processes:
p.join()
except KeyboardInterrupt:
# Close the queues.
for q in display_queues + reconcile_queues:
q.close()
# Release the cameras.
for cam in cameras:
cam.release()
# Launch images_to_videos script asynchronously.
launch_images_to_videos(view_dirs, vid_paths, debug_path)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
if __name__ == '__main__':
tf.app.run()
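# A minimal, self-contained sketch of the producer/consumer wiring used in
# main() above, with fake string "frames" standing in for webcam images so it
# runs without cameras. The helper names (_fake_capture, _fake_reconcile,
# _demo_pipeline) are hypothetical and not used elsewhere in this script.
def _fake_capture(view_idx, out_queue, num_frames):
  """Stands in for capture_webcam(): produces a few fake frames for one view."""
  for t in range(num_frames):
    out_queue.put('view%d_frame%d' % (view_idx, t))
def _fake_reconcile(view_queues, write_queue, num_frames):
  """Stands in for reconcile(): bundles one concurrent frame per view."""
  for _ in range(num_frames):
    write_queue.put([q.get() for q in view_queues])
def _demo_pipeline(num_views=2, num_frames=3):
  """Wires capture -> reconcile -> (stand-in) persist, then shuts down cleanly."""
  import multiprocessing as mp
  view_queues = [mp.Queue() for _ in range(num_views)]
  write_queue = mp.Queue()
  workers = [mp.Process(target=_fake_capture, args=(v, view_queues[v], num_frames))
             for v in range(num_views)]
  workers.append(mp.Process(
      target=_fake_reconcile, args=(view_queues, write_queue, num_frames)))
  for p in workers:
    p.start()
  # Stands in for persist(): consume each bundle of concurrent frames.
  for timestep in range(num_frames):
    print(timestep, write_queue.get())
  for p in workers:
    p.join()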
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads pretrained InceptionV3 and ResnetV2-50 checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tarfile
import urllib
INCEPTION_URL = 'http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz'
RESNET_URL = 'http://download.tensorflow.org/models/resnet_v2_50_2017_04_14.tar.gz'
def DownloadWeights(model_dir, url):
  """Downloads a checkpoint tarball from `url` and extracts it into `model_dir`."""
  os.makedirs(model_dir)
  tar_path = os.path.join(model_dir, 'ckpt.tar.gz')
  urllib.urlretrieve(url, tar_path)
  tar = tarfile.open(tar_path)
  tar.extractall(model_dir)
  tar.close()
if __name__ == '__main__':
# Create a directory for all pretrained checkpoints.
ckpt_dir = 'pretrained_checkpoints'
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir)
# Download inception.
print('Downloading inception pretrained weights...')
inception_dir = os.path.join(ckpt_dir, 'inception')
DownloadWeights(inception_dir, INCEPTION_URL)
print('Done downloading inception pretrained weights.')
print('Downloading resnet pretrained weights...')
resnet_dir = os.path.join(ckpt_dir, 'resnet')
DownloadWeights(resnet_dir, RESNET_URL)
print('Done downloading resnet pretrained weights.')
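# A minimal sketch of one way to sanity-check the extracted weights using
# tf.train.NewCheckpointReader. The default checkpoint filename below
# ('inception_v3.ckpt') is an assumption about the tarball contents; point it at
# whatever file actually lands under pretrained_checkpoints/ after extraction.
def inspect_checkpoint(
    ckpt_path='pretrained_checkpoints/inception/inception_v3.ckpt'):
  """Prints a few variable names and shapes stored in a checkpoint."""
  import tensorflow as tf
  reader = tf.train.NewCheckpointReader(ckpt_path)
  var_to_shape = reader.get_variable_to_shape_map()
  for name in sorted(var_to_shape)[:5]:
    print(name, var_to_shape[name])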
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Get a configured estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from estimators import mvtcn_estimator as mvtcn_estimators
from estimators import svtcn_estimator
def get_mvtcn_estimator(loss_strategy, config, logdir):
"""Returns a configured MVTCN estimator."""
loss_to_trainer = {
'triplet_semihard': mvtcn_estimators.MVTCNTripletEstimator,
'npairs': mvtcn_estimators.MVTCNNpairsEstimator,
}
if loss_strategy not in loss_to_trainer:
raise ValueError('Unknown loss for MVTCN: %s' % loss_strategy)
estimator = loss_to_trainer[loss_strategy](config, logdir)
return estimator
def get_estimator(config, logdir):
"""Returns an unsupervised model trainer based on config.
Args:
config: A T object holding training configs.
logdir: String, path to directory where model checkpoints and summaries
are saved.
Returns:
estimator: A configured `TCNEstimator` object.
Raises:
ValueError: If unknown training strategy is specified.
"""
# Get the training strategy.
training_strategy = config.training_strategy
if training_strategy == 'mvtcn':
loss_strategy = config.loss_strategy
estimator = get_mvtcn_estimator(
loss_strategy, config, logdir)
elif training_strategy == 'svtcn':
estimator = svtcn_estimator.SVTCNTripletEstimator(config, logdir)
else:
raise ValueError('Unknown training strategy: %s' % training_strategy)
return estimator
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""MVTCN trainer implementations with various metric learning losses."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import data_providers
import model as model_module
from estimators import base_estimator
import tensorflow as tf
class MVTCNEstimator(base_estimator.BaseEstimator):
"""Multi-view TCN base class."""
def __init__(self, config, logdir):
super(MVTCNEstimator, self).__init__(config, logdir)
def _pairs_provider(self, records, is_training):
config = self._config
num_views = config.data.num_views
window = config.mvtcn.window
num_parallel_calls = config.data.num_parallel_calls
sequence_prefetch_size = config.data.sequence_prefetch_size
batch_prefetch_size = config.data.batch_prefetch_size
examples_per_seq = config.data.examples_per_sequence
return functools.partial(
data_providers.multiview_pairs_provider,
file_list=records,
preprocess_fn=self.preprocess_data,
num_views=num_views,
window=window,
is_training=is_training,
examples_per_seq=examples_per_seq,
num_parallel_calls=num_parallel_calls,
sequence_prefetch_size=sequence_prefetch_size,
batch_prefetch_size=batch_prefetch_size)
def forward(self, images_concat, is_training, reuse=False):
"""See base class."""
embedder_strategy = self._config.embedder_strategy
loss_strategy = self._config.loss_strategy
l2_normalize_embedding = self._config[loss_strategy].embedding_l2
embedder = model_module.get_embedder(
embedder_strategy,
self._config,
images_concat,
is_training=is_training,
l2_normalize_embedding=l2_normalize_embedding, reuse=reuse)
embeddings_concat = embedder.construct_embedding()
variables_to_train = embedder.get_trainable_variables()
self.variables_to_train = variables_to_train
self.pretrained_init_fn = embedder.init_fn
return embeddings_concat
def _collect_image_summaries(self, anchor_images, positive_images,
images_concat):
image_summaries = self._config.logging.summary.image_summaries
if image_summaries and not self._config.use_tpu:
batch_pairs_summary = tf.concat(
[anchor_images, positive_images], axis=2)
tf.summary.image('training/mvtcn_pairs', batch_pairs_summary)
tf.summary.image('training/images_preprocessed_concat', images_concat)
class MVTCNTripletEstimator(MVTCNEstimator):
"""Multi-View TCN with semihard triplet loss."""
def __init__(self, config, logdir):
super(MVTCNTripletEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
def input_fn(params):
"""Provides input to MVTCN models."""
if is_training and self._config.use_tpu:
batch_size = params['batch_size']
else:
batch_size = self._batch_size
(images_concat,
anchor_labels,
positive_labels,
anchor_images,
positive_images) = self._pairs_provider(
records, is_training)(batch_size=batch_size)
if is_training:
self._collect_image_summaries(anchor_images, positive_images,
images_concat)
labels = tf.concat([anchor_labels, positive_labels], axis=0)
features = {'batch_preprocessed': images_concat}
return (features, labels)
return input_fn
def define_loss(self, embeddings, labels, is_training):
"""See base class."""
margin = self._config.triplet_semihard.margin
loss = tf.contrib.losses.metric_learning.triplet_semihard_loss(
labels=labels, embeddings=embeddings, margin=margin)
self._loss = loss
if is_training and not self._config.use_tpu:
tf.summary.scalar('training/triplet_semihard', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/triplet_semihard': tf.metrics.mean(self._loss)}
class MVTCNNpairsEstimator(MVTCNEstimator):
"""Multi-View TCN with npairs loss."""
def __init__(self, config, logdir):
super(MVTCNNpairsEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
def input_fn(params):
"""Provides input to MVTCN models."""
if is_training and self._config.use_tpu:
batch_size = params['batch_size']
else:
batch_size = self._batch_size
(images_concat,
npairs_labels,
_,
anchor_images,
positive_images) = self._pairs_provider(
records, is_training)(batch_size=batch_size)
if is_training:
self._collect_image_summaries(anchor_images, positive_images,
images_concat)
features = {'batch_preprocessed': images_concat}
return (features, npairs_labels)
return input_fn
def define_loss(self, embeddings, labels, is_training):
"""See base class."""
embeddings_anchor, embeddings_positive = tf.split(embeddings, 2, axis=0)
loss = tf.contrib.losses.metric_learning.npairs_loss(
labels=labels, embeddings_anchor=embeddings_anchor,
embeddings_positive=embeddings_positive)
self._loss = loss
if is_training and not self._config.use_tpu:
tf.summary.scalar('training/npairs', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/npairs': tf.metrics.mean(self._loss)}
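# A minimal, self-contained sketch of the data conventions the two estimators
# above rely on: embeddings arrive stacked as [anchors; positives] along axis 0.
# The triplet estimator labels each anchor/positive pair with a shared id; the
# npairs estimator splits the stack back in half and labels the pairs directly.
# Batch size, embedding size, margin and the random inputs are illustrative only.
def _demo_mvtcn_losses(num_pairs=4, embedding_dim=8):
  """Runs both metric-learning losses on random, l2-normalized embeddings."""
  import numpy as np
  anchors = tf.constant(np.random.randn(num_pairs, embedding_dim), tf.float32)
  positives = tf.constant(np.random.randn(num_pairs, embedding_dim), tf.float32)
  pair_ids = tf.range(num_pairs)
  embeddings_concat = tf.nn.l2_normalize(
      tf.concat([anchors, positives], axis=0), 1)
  # Triplet semihard: anchor and positive halves share the same pair id.
  labels = tf.concat([pair_ids, pair_ids], axis=0)
  triplet = tf.contrib.losses.metric_learning.triplet_semihard_loss(
      labels=labels, embeddings=embeddings_concat, margin=0.2)
  # Npairs: split the stacked embeddings back into anchor/positive halves.
  emb_anchor, emb_positive = tf.split(embeddings_concat, 2, axis=0)
  npairs = tf.contrib.losses.metric_learning.npairs_loss(
      labels=pair_ids, embeddings_anchor=emb_anchor,
      embeddings_positive=emb_positive)
  with tf.Session() as sess:
    print(sess.run([triplet, npairs]))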
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SVTCN estimator implementation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data_providers
import model as model_module
from estimators import base_estimator
from estimators import svtcn_loss
import tensorflow as tf
class SVTCNEstimator(base_estimator.BaseEstimator):
"""Single-view TCN Estimator base class."""
def __init__(self, config, logdir):
super(SVTCNEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
config = self._config
num_views = config.data.num_views
num_parallel_calls = config.data.num_parallel_calls
sequence_prefetch_size = config.data.sequence_prefetch_size
batch_prefetch_size = config.data.batch_prefetch_size
def input_fn():
"""Provides input to SVTCN models."""
(images_preprocessed,
images_raw,
timesteps) = data_providers.singleview_tcn_provider(
file_list=records,
preprocess_fn=self.preprocess_data,
num_views=num_views,
is_training=is_training,
batch_size=self._batch_size,
num_parallel_calls=num_parallel_calls,
sequence_prefetch_size=sequence_prefetch_size,
batch_prefetch_size=batch_prefetch_size)
if config.logging.summary.image_summaries and is_training:
tf.summary.image('training/svtcn_images', images_raw)
features = {'batch_preprocessed': images_preprocessed}
return (features, timesteps)
return input_fn
def forward(self, images, is_training, reuse=False):
"""See base class."""
embedder_strategy = self._config.embedder_strategy
embedder = model_module.get_embedder(
embedder_strategy,
self._config,
images,
is_training=is_training, reuse=reuse)
embeddings = embedder.construct_embedding()
if is_training:
self.variables_to_train = embedder.get_trainable_variables()
self.pretrained_init_fn = embedder.init_fn
return embeddings
class SVTCNTripletEstimator(SVTCNEstimator):
"""Single-View TCN with semihard triplet loss."""
def __init__(self, config, logdir):
super(SVTCNTripletEstimator, self).__init__(config, logdir)
def define_loss(self, embeddings, timesteps, is_training):
"""See base class."""
pos_radius = self._config.svtcn.pos_radius
neg_radius = self._config.svtcn.neg_radius
margin = self._config.triplet_semihard.margin
loss = svtcn_loss.singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=margin)
self._loss = loss
if is_training:
tf.summary.scalar('training/svtcn_loss', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/svtcn_loss': tf.metrics.mean(self._loss)}
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This implements single view TCN triplet loss."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def pairwise_squared_distance(feature):
"""Computes the squared pairwise distance matrix.
output[i, j] = || feature[i, :] - feature[j, :] ||_2^2
Args:
feature: 2-D Tensor of size [number of data, feature dimension]
Returns:
pairwise_squared_distances: 2-D Tensor of size
[number of data, number of data]
"""
pairwise_squared_distances = tf.add(
tf.reduce_sum(
tf.square(feature), axis=1, keep_dims=True),
tf.reduce_sum(
tf.square(tf.transpose(feature)), axis=0,
keep_dims=True)) - 2.0 * tf.matmul(feature, tf.transpose(feature))
# Deal with numerical inaccuracies. Set small negatives to zero.
pairwise_squared_distances = tf.maximum(pairwise_squared_distances, 0.0)
return pairwise_squared_distances
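# A small cross-check (illustrative helper, not used elsewhere) of the identity
# computed above: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, comparing the TF
# implementation against a naive NumPy double loop on random features.
def _check_pairwise_squared_distance(num_points=5, dim=3):
  """Asserts the TF pairwise distances match the naive O(n^2 d) computation."""
  import numpy as np
  feature = np.random.randn(num_points, dim).astype(np.float32)
  naive = np.array([[np.sum((a - b) ** 2) for b in feature] for a in feature])
  with tf.Session() as sess:
    fast = sess.run(pairwise_squared_distance(tf.constant(feature)))
  assert np.allclose(naive, fast, atol=1e-4)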
def masked_maximum(data, mask, dim=1):
"""Computes the axis wise maximum over chosen elements.
Args:
data: N-D Tensor.
mask: N-D Tensor of zeros or ones.
dim: The dimension over which to compute the maximum.
Returns:
masked_maximums: N-D Tensor.
The maximized dimension is of size 1 after the operation.
"""
axis_minimums = tf.reduce_min(data, dim, keep_dims=True)
masked_maximums = tf.reduce_max(
tf.multiply(
data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
return masked_maximums
def masked_minimum(data, mask, dim=1):
"""Computes the axis wise minimum over chosen elements.
Args:
data: 2-D Tensor of size [n, m].
mask: 2-D Boolean Tensor of size [n, m].
dim: The dimension over which to compute the minimum.
Returns:
masked_minimums: N-D Tensor.
The minimized dimension is of size 1 after the operation.
"""
axis_maximums = tf.reduce_max(data, dim, keep_dims=True)
masked_minimums = tf.reduce_min(
tf.multiply(
data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
return masked_minimums
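# A small worked example (illustrative helper) of why masked_maximum and
# masked_minimum shift by the row minimum/maximum first: masked-out entries then
# contribute exactly zero, so they can never win over the selected elements.
def _check_masked_extrema():
  """Checks masked max/min on a tiny hand-computed example."""
  import numpy as np
  data = tf.constant([[1.0, 5.0, 3.0],
                      [4.0, 2.0, 6.0]])
  mask = tf.constant([[0.0, 1.0, 1.0],
                      [1.0, 0.0, 1.0]])
  with tf.Session() as sess:
    max_vals, min_vals = sess.run(
        [masked_maximum(data, mask), masked_minimum(data, mask)])
  # Row 0 selects {5, 3}; row 1 selects {4, 6}.
  assert np.allclose(max_vals.ravel(), [5.0, 6.0])
  assert np.allclose(min_vals.ravel(), [3.0, 4.0])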
def singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=1.0,
sequence_ids=None, multiseq=False):
"""Computes the single view triplet loss with semi-hard negative mining.
  The loss encourages the positive distances (between a pair of embeddings with
  the same labels) to be smaller than the minimum negative distance among
  negatives that are at least greater than the positive distance plus the
  margin constant (the semi-hard negatives) in the mini-batch. If no such
  negative exists, the largest negative distance is used instead.
  Anchor, positive, and negative selection works as follows:
Anchors: We consider every embedding timestep as an anchor.
Positives: pos_radius defines a radius (in timesteps) around each anchor from
which positives can be drawn. E.g. An anchor with t=10 and a pos_radius of
2 produces a set of 4 (anchor,pos) pairs [(a=10, p=8), ... (a=10, p=12)].
Negatives: neg_radius defines a boundary (in timesteps) around each anchor,
outside of which negatives can be drawn. E.g. An anchor with t=10 and a
    neg_radius of 4 means negatives can be any t_neg where t_neg < 6 or
t_neg > 14.
Args:
embeddings: 2-D Tensor of embedding vectors.
timesteps: 1-D Tensor with shape [batch_size, 1] of sequence timesteps.
pos_radius: int32; the size of the window (in timesteps) around each anchor
timestep that a positive can be drawn from.
neg_radius: int32; the size of the window (in timesteps) around each anchor
timestep that defines a negative boundary. Negatives can only be chosen
where negative timestep t is < negative boundary min or > negative
boundary max.
margin: Float; the triplet loss margin hyperparameter.
sequence_ids: (Optional) 1-D Tensor with shape [batch_size, 1] of sequence
ids. Together (sequence_id, sequence_timestep) give us a unique index for
each image if we have multiple sequences in a batch.
multiseq: Boolean, whether or not the batch is composed of multiple
sequences (with possibly colliding timesteps).
Returns:
triplet_loss: tf.float32 scalar.
"""
assert neg_radius > pos_radius
# If timesteps shape isn't [batchsize, 1], reshape to [batch_size, 1].
tshape = tf.shape(timesteps)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
timesteps = tf.reshape(timesteps, [tshape[0], 1])
# Build pairwise squared distance matrix.
pdist_matrix = pairwise_squared_distance(embeddings)
# Build pairwise binary adjacency matrix, where adjacency[i,j] is True
# if timestep j is inside the positive range for timestep i and both
# timesteps come from the same sequence.
pos_radius = tf.cast(pos_radius, tf.int32)
if multiseq:
# If sequence_ids shape isn't [batchsize, 1], reshape to [batch_size, 1].
tshape = tf.shape(sequence_ids)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
sequence_ids = tf.reshape(sequence_ids, [tshape[0], 1])
# Build pairwise binary adjacency matrix based on sequence_ids
sequence_adjacency = tf.equal(sequence_ids, tf.transpose(sequence_ids))
# Invert so we can select negatives only.
sequence_adjacency_not = tf.logical_not(sequence_adjacency)
in_pos_range = tf.logical_and(
tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius),
sequence_adjacency)
# Build pairwise binary discordance matrix, where discordance[i,j] is True
# if timestep j is inside the negative range for timestep i or if the
# timesteps come from different sequences.
in_neg_range = tf.logical_or(
tf.greater(tf.abs(timesteps - tf.transpose(timesteps)), neg_radius),
sequence_adjacency_not
)
else:
in_pos_range = tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius)
in_neg_range = tf.greater(tf.abs(timesteps - tf.transpose(timesteps)),
neg_radius)
batch_size = tf.size(timesteps)
# compute the mask
pdist_matrix_tile = tf.tile(pdist_matrix, [batch_size, 1])
mask = tf.logical_and(
tf.tile(in_neg_range, [batch_size, 1]),
tf.greater(pdist_matrix_tile,
tf.reshape(tf.transpose(pdist_matrix), [-1, 1])))
mask_final = tf.reshape(
tf.greater(
tf.reduce_sum(
tf.cast(
mask, dtype=tf.float32), 1, keep_dims=True),
0.0), [batch_size, batch_size])
mask_final = tf.transpose(mask_final)
in_neg_range = tf.cast(in_neg_range, dtype=tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
# negatives_outside: smallest D_an where D_an > D_ap
negatives_outside = tf.reshape(
masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
negatives_outside = tf.transpose(negatives_outside)
# negatives_inside: largest D_an
negatives_inside = tf.tile(
masked_maximum(pdist_matrix, in_neg_range), [1, batch_size])
semi_hard_negatives = tf.where(
mask_final, negatives_outside, negatives_inside)
loss_mat = tf.add(margin, pdist_matrix - semi_hard_negatives)
mask_positives = tf.cast(
in_pos_range, dtype=tf.float32) - tf.diag(tf.ones([batch_size]))
  # In lifted-struct, the authors multiply by 0.5 for the upper triangular pairs;
  # in semihard, all positive pairs except the diagonal are used.
num_positives = tf.reduce_sum(mask_positives)
triplet_loss = tf.truediv(
tf.reduce_sum(tf.maximum(tf.multiply(loss_mat, mask_positives), 0.0)),
num_positives,
name='triplet_svtcn_loss')
return triplet_loss
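# A minimal NumPy sketch (illustrative helper, mirroring the docstring example
# above) of the window-based positive/negative selection: with pos_radius=2 and
# neg_radius=4, an anchor at t=5 in a 10-step sequence draws positives from
# {3, 4, 6, 7} and negatives from {0}.
def _demo_window_selection(pos_radius=2, neg_radius=4, anchor_t=5, num_steps=10):
  """Prints which timesteps are valid positives/negatives for one anchor."""
  import numpy as np
  timesteps = np.arange(num_steps)
  dist = np.abs(timesteps - anchor_t)
  in_pos_range = (dist <= pos_radius) & (dist > 0)  # Exclude the anchor itself.
  in_neg_range = dist > neg_radius
  print('positives for t=%d:' % anchor_t, timesteps[in_pos_range])  # [3 4 6 7]
  print('negatives for t=%d:' % anchor_t, timesteps[in_neg_range])  # [0]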